diff --git a/dlib/cuda/cpu_dlib.cpp b/dlib/cuda/cpu_dlib.cpp index 11b7518e71..c55d1aa04e 100644 --- a/dlib/cuda/cpu_dlib.cpp +++ b/dlib/cuda/cpu_dlib.cpp @@ -1494,7 +1494,6 @@ namespace dlib } p_scale[n] = 1.0f / std::sqrt(p_scale[n] / (ks * num) + static_cast(eps)); } - scale.host(); // Apply RMS normalization p_src = src.host(); @@ -1648,14 +1647,22 @@ namespace dlib for (long k = 0; k < num_channels; ++k) max_val = std::max(max_val, ss[k * num_locations]); - float sum = 0.0f; - for (long k = 0; k < num_channels; ++k) + if (max_val == -std::numeric_limits::infinity()) { - dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val); - sum += dd[k * num_locations]; + for (long k = 0; k < num_channels; ++k) + dd[k * num_locations] = 0.0f; + } + else + { + float sum = 0.0f; + for (long k = 0; k < num_channels; ++k) + { + dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val); + sum += dd[k * num_locations]; + } + for (long k = 0; k < num_channels; ++k) + dd[k * num_locations] /= sum; } - for (long k = 0; k < num_channels; ++k) - dd[k * num_locations] /= sum; ++ss; ++dd; @@ -3373,6 +3380,69 @@ namespace dlib } } + // ------------------------------------------------------------------------------------ + + void apply_rotary_positional_embedding( + bool is_backward, + resizable_tensor& data, + const resizable_tensor& cos_cache, + const resizable_tensor& sin_cache) + { + const long batch_size = data.num_samples(); + const long num_heads = data.k(); + const long seq_len = data.nr(); + const long d_head = data.nc(); + const long half_d = d_head / 2; + + DLIB_CASSERT(cos_cache.nr() == seq_len, "cos_cache rows must match seq_len"); + DLIB_CASSERT(cos_cache.nc() == half_d, "cos_cache cols must be d_head/2"); + DLIB_CASSERT(sin_cache.nr() == seq_len, "sin_cache rows must match seq_len"); + DLIB_CASSERT(sin_cache.nc() == half_d, "sin_cache cols must be d_head/2"); + + const bool is_odd = (d_head % 2 != 0); + const long rot_dim = is_odd ? 
d_head - 1 : d_head; + + float* data_ptr = data.host(); + const float* cos_ptr = cos_cache.host(); + const float* sin_ptr = sin_cache.host(); + + const size_t total_elements = batch_size * num_heads * seq_len * half_d; + + parallel_for(0, total_elements, [&](long idx) + { + const long pair_idx = idx % half_d; + const long pos = (idx / half_d) % seq_len; + const long head = (idx / (half_d * seq_len)) % num_heads; + const long batch = idx / (half_d * seq_len * num_heads); + + const long dim_i = pair_idx * 2; + if (dim_i >= rot_dim) return; + + const long data_offset = ((batch * num_heads + head) * seq_len + pos) * d_head + dim_i; + const long trig_offset = pos * half_d + pair_idx; + + const float c = cos_ptr[trig_offset]; + const float s = sin_ptr[trig_offset]; + const float x0 = data_ptr[data_offset]; + const float x1 = data_ptr[data_offset + 1]; + + if (!is_backward) + { + // Forward: [cos -sin] [x0] + // [sin cos] [x1] + data_ptr[data_offset] = x0 * c - x1 * s; + data_ptr[data_offset + 1] = x0 * s + x1 * c; + } + else + { + // Backward (inverse rotation): [cos sin] [x0] + // [-sin cos] [x1] + data_ptr[data_offset] = x0 * c + x1 * s; + data_ptr[data_offset + 1] = -x0 * s + x1 * c; + } + }); + } + // ------------------------------------------------------------------------------------ } diff --git a/dlib/cuda/cpu_dlib.h b/dlib/cuda/cpu_dlib.h index 8c82ee856c..547142544c 100644 --- a/dlib/cuda/cpu_dlib.h +++ b/dlib/cuda/cpu_dlib.h @@ -586,6 +586,15 @@ namespace dlib float scale_factor ); + // ----------------------------------------------------------------------------------- + + void apply_rotary_positional_embedding( + bool is_backward, + resizable_tensor& data, + const resizable_tensor& cos_cache, + const resizable_tensor& sin_cache + ); + // ----------------------------------------------------------------------------------- class pooling @@ -763,6 +772,138 @@ namespace dlib // ----------------------------------------------------------------------------------- + 
class compute_loss_cross_entropy_per_logit + { + /*! + Computes cross-entropy loss for causal language modeling + Uses all sequence positions (except last) for training + Each position t predicts the token at position t+1 + !*/ + public: + compute_loss_cross_entropy_per_logit() {} + + template + void operator()( + const_label_iterator truth, + const tensor& input_tensor, + const tensor& output_tensor, + tensor& grad, + double& loss, + long ignore_index + ) const + { + DLIB_CASSERT(output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.k() == 1); + DLIB_CASSERT(input_tensor.nc() == 1); + + const long batch_size = output_tensor.num_samples(); + const long seq_len = output_tensor.nr(); + const long vocab_size = output_tensor.nc(); + + const float* out_data = output_tensor.host(); + const float* in_data = input_tensor.host(); + float* g = grad.host(); + + std::fill(g, g + grad.size(), 0.0f); + + long valid_tokens = 0; + + if (ignore_index < 0) + { + valid_tokens = batch_size * seq_len; + } + else { + for (long i = 0; i < batch_size; ++i) + { + for (long t = 0; t < seq_len; ++t) + { + unsigned long target_class; + if (t < seq_len - 1) { + target_class = static_cast( + in_data[tensor_index(input_tensor, i, 0, t + 1, 0)] + ); + } + else + target_class = *(truth + i); + + if (static_cast(target_class) != ignore_index) + valid_tokens++; + } + } + } + if (valid_tokens == 0) + { + loss = 0.0; + return; + } + + const double scale = 1.0 / valid_tokens; + loss = 0.0; + + for (long i = 0; i < batch_size; ++i) + { + // Loop over all positions (0 to seq_len-1) + for (long t = 0; t < seq_len; ++t) + { + unsigned long target_class; + + // Extract target token + if (t < seq_len - 1) { + // For positions 0 to seq_len-2: target from input_tensor[t+1] + target_class = static_cast( + in_data[tensor_index(input_tensor, i, 0, t + 1, 0)] + ); + } else { + // For last position (seq_len-1): target from truth + target_class = *(truth + i); + } + + if (ignore_index >= 0 && 
static_cast(target_class) == ignore_index) + continue; + + DLIB_CASSERT(target_class < static_cast(vocab_size)); + + // Find max logit for numerical stability + float max_val = out_data[tensor_index(output_tensor, i, 0, t, 0)]; + for (long c = 1; c < vocab_size; ++c) + { + const float val = out_data[tensor_index(output_tensor, i, 0, t, c)]; + max_val = std::max(max_val, val); + } + + // Compute softmax denominator + float sum_exp = 0.0f; + for (long c = 0; c < vocab_size; ++c) + { + const unsigned long idx = tensor_index(output_tensor, i, 0, t, c); + const float exp_val = std::exp(out_data[idx] - max_val); + g[idx] = exp_val; + sum_exp += exp_val; + } + + // Compute loss and gradients + for (long c = 0; c < vocab_size; ++c) + { + const unsigned long idx = tensor_index(output_tensor, i, 0, t, c); + const float softmax_val = g[idx] / sum_exp; + + if (static_cast(c) == target_class) + { + loss += scale * (-std::log(std::max(softmax_val, 1e-10f))); + g[idx] = scale * (softmax_val - 1.0f); + } + else + { + g[idx] = scale * softmax_val; + } + } + } + } + } + }; + + // ----------------------------------------------------------------------------------- + class compute_loss_binary_log_per_pixel { diff --git a/dlib/cuda/cublas_dlibapi.cpp b/dlib/cuda/cublas_dlibapi.cpp index 064e92c3df..3e4c38d8e8 100644 --- a/dlib/cuda/cublas_dlibapi.cpp +++ b/dlib/cuda/cublas_dlibapi.cpp @@ -159,16 +159,21 @@ namespace dlib const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; const auto transb = trans_rhs ? 
CUBLAS_OP_T : CUBLAS_OP_N; - long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); - long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); - - auto is_matrix = [](const auto& tensor) { - return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) || - (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1)); - }; - const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest); - - if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) num_samples = num_channels = 1; + const bool lhs_is_matrix = is_2d_matrix(lhs); + const bool rhs_is_matrix = is_2d_matrix(rhs); + const bool dest_is_matrix = is_2d_matrix(dest); + + const size_t lhs_plane_size = lhs.nr() * lhs.nc(); + const size_t rhs_plane_size = rhs.nr() * rhs.nc(); + const size_t dest_plane_size = dest.nr() * dest.nc(); + + long num_samples, num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); + if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) + num_samples = 1; + else if (!lhs_is_matrix && rhs_is_matrix) + num_samples = lhs.num_samples(); + else + num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); size_t lhs_rows = lhs.nr(); size_t lhs_cols = lhs.nc(); @@ -176,12 +181,14 @@ namespace dlib lhs_rows = lhs.num_samples(); lhs_cols = lhs.k(); } + size_t rhs_rows = rhs.nr(); size_t rhs_cols = rhs.nc(); if (rhs_is_matrix && (rhs.num_samples() > 1 || rhs.k() > 1)) { rhs_rows = rhs.num_samples(); rhs_cols = rhs.k(); } + size_t dest_rows = dest.nr(); size_t dest_cols = dest.nc(); if (dest_is_matrix && (dest.num_samples() > 1 || dest.k() > 1)) { @@ -189,10 +196,6 @@ namespace dlib dest_cols = dest.k(); } - const size_t lhs_plane_size = lhs_rows * lhs_cols; - const size_t rhs_plane_size = rhs_rows * rhs_cols; - const size_t dest_plane_size = dest_rows * dest_cols; - for (long b = 0; b < num_samples; ++b) { for (long c = 0; c < num_channels; ++c) @@ 
-203,12 +206,18 @@ namespace dlib rhs.device() + (b * num_channels + c) * rhs_plane_size; auto dest_slice = dest_is_matrix ? dest.device() : dest.device() + (b * num_channels + c) * dest_plane_size; + const int k = trans_rhs ? rhs_cols : rhs_rows; CHECK_CUBLAS(cublasSgemm( - context(), transb, transa, dest_cols, dest_rows, k, - &alpha, rhs_slice, rhs_cols, lhs_slice, lhs_cols, - &beta, dest_slice, dest_cols + context(), + transb, transa, + dest_cols, dest_rows, k, + &alpha, + rhs_slice, rhs_cols, + lhs_slice, lhs_cols, + &beta, + dest_slice, dest_cols )); } } diff --git a/dlib/cuda/cuda_dlib.cu b/dlib/cuda/cuda_dlib.cu index 82522929ef..b22b447643 100644 --- a/dlib/cuda/cuda_dlib.cu +++ b/dlib/cuda/cuda_dlib.cu @@ -1,3223 +1,3355 @@ -// Copyright (C) 2015 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. - -#include "cuda_utils.h" -#include "cuda_dlib.h" -#include "cudnn_dlibapi.h" -#include - - -namespace dlib -{ - namespace cuda - { - - // ----------------------------------------------------------------------------------- - - void set_device ( - int dev - ) - { - CHECK_CUDA(cudaSetDevice(dev)); - } - - int get_device ( - ) - { - int dev = 0; - CHECK_CUDA(cudaGetDevice(&dev)); - return dev; - } - - std::string get_device_name ( - int device - ) - { - cudaDeviceProp props; - CHECK_CUDA(cudaGetDeviceProperties(&props, device)); - return props.name; - } - - void set_current_device_blocking_sync( - ) - { - CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); - } - - int get_num_devices ( - ) - { - int num_devices; - CHECK_CUDA(cudaGetDeviceCount(&num_devices)); - return num_devices; - } - - bool can_access_peer (int device_id, int peer_device_id) - { - int can_access; - CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, device_id, peer_device_id)); - return can_access != 0; - } - bool can_access_peer (const tensor& device, const tensor& peer_device) - { - return can_access_peer(device.device_id(), 
peer_device.device_id()); - } - - void device_synchronize (int dev) - { - raii_set_device set_dev(dev); - CHECK_CUDA(cudaDeviceSynchronize()); - } - void device_synchronize (const tensor& dev) { device_synchronize(dev.device_id()); } - - enable_peer_access:: - enable_peer_access( - int device_id, - int peer_device_id - ) : call_disable(false), device_id(device_id), peer_device_id(peer_device_id) - { - raii_set_device set_dev(device_id); - - auto err = cudaDeviceEnablePeerAccess(peer_device_id, 0); - if (err == cudaSuccess) - { - call_disable = true; - } - else if (err == cudaErrorPeerAccessAlreadyEnabled) - { - // call cudaGetLastError() to dispose of this error since we don't - // care. - auto err2 = cudaGetLastError(); - if (err2 != cudaErrorPeerAccessAlreadyEnabled) - CHECK_CUDA(err2); - } - else - { - CHECK_CUDA(err); - } - } - - - enable_peer_access:: - ~enable_peer_access() noexcept(false) - { - if (call_disable) - { - raii_set_device set_dev(device_id); - CHECK_CUDA(cudaDeviceDisablePeerAccess(peer_device_id)); - } - } - - // ----------------------------------------------------------------------------------- - // ----------------------------------------------------------------------------------- - // ----------------------------------------------------------------------------------- - - __global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps) - { - // initialize invnorms before we begin. 
- for (auto i : grid_stride_range_y(0, nr)) - for (auto j : grid_stride_range(0, 1)) - invnorms[i] = eps; - __syncthreads(); - - for (auto i : grid_stride_range_y(0, nr)) - { - auto p = data + i*nc; - float temp = 0; - for (auto j : grid_stride_range(0, nc)) - temp += p[j]*p[j]; - - // and store the sum into invnorms[i] - warp_reduce_atomic_add(invnorms[i], temp); - } - __syncthreads(); - - for (auto i : grid_stride_range_y(0, nr)) - for (auto j : grid_stride_range(0, 1)) - invnorms[i] = 1.0/std::sqrt(invnorms[i]); - } - - void inverse_norms ( - resizable_tensor& invnorms, - const tensor& data, - const double eps - ) - { - invnorms.set_size(data.num_samples()); - launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()), - invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) - { - // initialize out before we begin. 
- for (auto i : grid_stride_range_y(0, nr)) - for (auto j : grid_stride_range(0, 1)) - out[i] = 0; - __syncthreads(); - - for (auto i : grid_stride_range_y(0, nr)) - { - auto l = lhs + i*nc; - auto r = rhs + i*nc; - float temp = 0; - for (auto j : grid_stride_range(0, nc)) - temp += l[j]*r[j]; - - // and store the sum into out[i] - warp_reduce_atomic_add(out[i], temp); - } - } - - __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) - { - for (auto i : grid_stride_range_y(0, nr)) - { - auto l = lhs + i*nc; - auto r = rhs + i*nc; - float temp = 0; - for (auto j : grid_stride_range(0, nc)) - temp += l[j]*r[j]; - - // and store the sum into out[i] - warp_reduce_atomic_add(out[i], temp); - } - } - - void dot_prods ( - resizable_tensor& out, - const tensor& lhs, - const tensor& rhs - ) - { - DLIB_CASSERT(have_same_dimensions(lhs,rhs)); - - out.set_size(lhs.num_samples()); - if (out.size() == 0) - return; - - const auto nr = lhs.num_samples(); - const auto nc = lhs.size()/lhs.num_samples(); - - launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc); - } - - void dot_prods ( - bool add_to, - tensor& out, - const tensor& lhs, - const tensor& rhs - ) - { - DLIB_CASSERT(have_same_dimensions(lhs,rhs)); - DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1); - DLIB_CASSERT(out.size() == lhs.num_samples()); - - const auto nr = lhs.num_samples(); - const auto nc = lhs.size()/lhs.num_samples(); - - if (add_to) - launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc); - else - launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_scale_columns(float* out, const float* m, const float* v, size_t nr, size_t nc) - { - for (auto j : 
grid_stride_range(0, nr*nc)) - { - out[j] = m[j]*v[j%nc]; - } - } - - void scale_columns ( - tensor& out, - const tensor& m, - const tensor& v - ) - { - launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_scale_rows(float* out, const float* m, const float* v, size_t nr, size_t nc) - { - for (auto j : grid_stride_range(0, nr*nc)) - { - out[j] = m[j]*v[j/nc]; - } - } - - void scale_rows ( - tensor& out, - const tensor& m, - const tensor& v - ) - { - launch_kernel(_cuda_scale_rows, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_scale_rows2(float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) - { - for (auto j : grid_stride_range(0, nr*nc)) - { - out[j] = (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; - } - } - - __global__ void _cuda_scale_rows2_beta(const float beta, float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) - { - for (auto j : grid_stride_range(0, nr*nc)) - { - out[j] = beta*out[j] + (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; - } - } - - void scale_rows2 ( - float beta, - tensor& out, - const tensor& m1, - const tensor& m2, - const tensor& v1, - const tensor& v2 - ) - { - if (beta == 0) - { - launch_kernel(_cuda_scale_rows2, max_jobs(m1.size()), out.device(), - m1.device(), m2.device(), v1.device(), v2.device(), m1.num_samples(), - m1.size()/m1.num_samples()); - } - else - { - launch_kernel(_cuda_scale_rows2_beta, max_jobs(m1.size()), beta, - out.device(), m1.device(), m2.device(), v1.device(), v2.device(), - m1.num_samples(), m1.size()/m1.num_samples()); - } - } - - // 
---------------------------------------------------------------------------------------- - - __global__ void _cuda_exp(float* dest, const float* src, size_t n) - { - for (auto i : grid_stride_range(0, n)) - dest[i] = ::exp(src[i]); - } - - void exp ( - tensor& dest, - const tensor& src - ) - { - DLIB_ASSERT(dest.size() == src.size()); - launch_kernel(_cuda_exp, max_jobs(src.size()), dest.device(), src.device(), src.size()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_log(float* dest, const float* src, size_t n) - { - for (auto i : grid_stride_range(0, n)) - dest[i] = ::log(src[i]); - } - - void log ( - tensor& dest, - const tensor& src - ) - { - DLIB_ASSERT(dest.size() == src.size()); - launch_kernel(_cuda_log, max_jobs(src.size()), dest.device(), src.device(), src.size()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_log10(float* dest, const float* src, size_t n) - { - for (auto i : grid_stride_range(0, n)) - dest[i] = ::log10(src[i]); - } - - void log10 ( - tensor& dest, - const tensor& src - ) - { - DLIB_ASSERT(dest.size() == src.size()); - launch_kernel(_cuda_log10, max_jobs(src.size()), dest.device(), src.device(), src.size()); - } - - // ----------------------------------------------------------------------------------- - - __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = s1[i]*s2[i]; - } - } - __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2, - size_t n, size_t s1_n, size_t s2_n, size_t max_size) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = 0; - for (size_t j = i; j < max_size; j += n) - d[i] += s1[j%s1_n]*s2[j%s2_n]; - } - } - - __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2, - size_t n, size_t s1_n, size_t s2_n) - { - for (auto i : 
grid_stride_range(0, n)) - { - d[i] = s1[i%s1_n]*s2[i%s2_n]; - } - } - - __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] += s1[i]*s2[i]; - } - } - __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2, - size_t n, size_t s1_n, size_t s2_n, size_t max_size) - { - for (auto i : grid_stride_range(0, n)) - { - for (size_t j = i; j < max_size; j += n) - d[i] += s1[j%s1_n]*s2[j%s2_n]; - } - } - - __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2, - size_t n, size_t s1_n, size_t s2_n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] += s1[i%s1_n]*s2[i%s2_n]; - } - } - - void multiply ( - bool add_to, - tensor& dest, - const tensor& src1, - const tensor& src2 - ) - { - - DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && - dest.nr() == src1.nr() && src1.nr() == src2.nr() && - dest.nc() == src1.nc() && src1.nc() == src2.nc() ); - const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); - DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && - (src1.num_samples()==1 || src1.num_samples()==MD) && - (src2.num_samples()==1 || src2.num_samples()==MD) ); - - if (dest.size() == 0) - return; - - const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size()); - const auto d = dest.host(); - const auto s1 = src1.host(); - const auto s2 = src2.host(); - if (dest.size() == src1.size() && src1.size() == src2.size()) - { - if (add_to) - launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); - else - launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); - } - else if (dest.num_samples() == 1) - { - if (add_to) - launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), - dest.size(), 
src1.size(), src2.size(), max_size); - else - launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), - dest.size(), src1.size(), src2.size(), max_size); - } - else - { - if (add_to) - launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), - dest.size(), src1.size(), src2.size()); - else - launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), - dest.size(), src1.size(), src2.size()); - } - } - - // ------------------------------------------------------------------------------------ - - __global__ void _cuda_multiply_conv(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) - { - for (auto i : grid_stride_range(0, n)) - { - auto k = (i/bs)%ks; - d[i] = s1[i]*s2[k]; - } - } - - __global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) - { - // zero initialize d before we begin. - for (auto i : grid_stride_range_y(0, ks)) - for (auto j : grid_stride_range(0, 1)) - d[i] = 0; - __syncthreads(); - - // loop over all the image planes - for (auto i : grid_stride_range_y(0, n)) - { - // sum all the elements in the i-th image plane - float temp = 0; - for (auto j : grid_stride_range(i*bs, (i+1)*bs)) - temp += s1[j]*s2[j]; - auto k = i%ks; - // and store the sum into d[k] - warp_reduce_atomic_add(d[k], temp); - } - } - - __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) - { - for (auto i : grid_stride_range(0, n)) - { - auto k = (i/bs)%ks; - d[i] += s1[i]*s2[k]; - } - } - - __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) - { - // loop over all the image planes - for (auto i : grid_stride_range_y(0, n)) - { - // sum all the elements in the i-th image plane - float temp = 0; - for (auto j : grid_stride_range(i*bs, (i+1)*bs)) - temp 
+= s1[j]*s2[j]; - auto k = i%ks; - // and store the sum into d[k] - warp_reduce_atomic_add(d[k], temp); - } - } - - - void multiply_conv ( - bool add_to, - tensor& dest, - const tensor& src1, - const tensor& src2 - ) - { - if (have_same_dimensions(dest,src1)) - { - DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k()); - if (dest.size() == 0) - return; - - if (add_to) - launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()), - dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k()); - else - launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()), - dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k()); - } - else - { - DLIB_CASSERT(have_same_dimensions(src1,src2)); - DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k()); - if (dest.size() == 0) - return; - - - const auto bs = src1.nr()*src1.nc(); - const auto n = src1.num_samples()*src1.k(); - if (add_to) - launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n), - dest.device(), src1.device(), n, src2.device(), bs, src1.k()); - else - launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n), - dest.device(), src1.device(), n, src2.device(), bs, src1.k()); - } - - } - - // ------------------------------------------------------------------------------------ - - __global__ void _cuda_scale_channels_add_to(float* d, const float* src, size_t n, const float* scales, size_t bs) - { - for (auto i : grid_stride_range(0, n)) - { - auto k = i/bs; - d[i] += src[i]*scales[k]; - } - } - - __global__ void _cuda_scale_channels(float* d, const float* src, size_t n, const float* scales, size_t bs) - { - for (auto i : grid_stride_range(0, n)) - { - auto k = i/bs; - d[i] = src[i]*scales[k]; - } - } - - void scale_channels ( - bool add_to, - tensor& dest, - const tensor& src, - const tensor& scales - ) - { - DLIB_CASSERT(have_same_dimensions(dest,src) && - scales.num_samples() 
== src.num_samples() && - scales.k() == src.k() && - scales.nr() == 1 && - scales.nc() == 1 ); - - if (dest.size() == 0) - return; - - if (add_to) - launch_kernel(_cuda_scale_channels_add_to,max_jobs(dest.size()), - dest.device(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); - else - launch_kernel(_cuda_scale_channels,max_jobs(dest.size()), - dest.device_write_only(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); - } - - // ------------------------------------------------------------------------------------ - - __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = s1[i]*s2[i]; - } - } - - __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] += s1[i]*s2[i]; - } - } - - __global__ void _cuda_mult2(float* d, const float* s1, const float* s2, - size_t dn, size_t dk, size_t dr, size_t dc, - size_t s1n, size_t s1k, size_t s1r, size_t s1c, - size_t s2n, size_t s2k, size_t s2r, size_t s2c) - { - for (auto i : grid_stride_range(0, dn*dk*dr*dc)) - { - size_t n,k,r,c; - unpack_idx(i, dk,dr,dc, n,k,r,c); - - float v1 = 0; - float v2 = 0; - - if (n < s1n && - k < s1k && - r < s1r && - c < s1c ) - { - v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; - } - - if (n < s2n && - k < s2k && - r < s2r && - c < s2c ) - { - v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; - } - - d[i] = v1*v2; - } - } - - __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2, - size_t dn, size_t dk, size_t dr, size_t dc, - size_t s1n, size_t s1k, size_t s1r, size_t s1c, - size_t s2n, size_t s2k, size_t s2r, size_t s2c) - { - for (auto i : grid_stride_range(0, dn*dk*dr*dc)) - { - size_t n,k,r,c; - unpack_idx(i, dk,dr,dc, n,k,r,c); - - float v1 = 0; - float v2 = 0; - - if (n < s1n && - k < s1k && - r < s1r && - c < s1c ) - { - v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; - } - - if (n < s2n && 
- k < s2k && - r < s2r && - c < s2c ) - { - v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; - } - - d[i] += v1*v2; - } - } - - void multiply_zero_padded ( - bool add_to, - tensor& dest, - const tensor& src1, - const tensor& src2 - ) - { - if (dest.size() == 0) - return; - - // Do the simple and fast version if everything has the same dimensions - if (have_same_dimensions(dest, src1) && - have_same_dimensions(dest, src2)) - { - if (add_to) - launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); - else - launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); - } - else - { - if (add_to) - { - // Otherwise, do the more complex version with bounds checking. - launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()), - dest.device(), src1.device(), src2.device(), - dest.num_samples(), dest.k(), dest.nr(), dest.nc(), - src1.num_samples(), src1.k(), src1.nr(), src1.nc(), - src2.num_samples(), src2.k(), src2.nr(), src2.nc() - ); - } - else - { - // Otherwise, do the more complex version with bounds checking. 
- launch_kernel(_cuda_mult2,max_jobs(dest.size()), - dest.device(), src1.device(), src2.device(), - dest.num_samples(), dest.k(), dest.nr(), dest.nc(), - src1.num_samples(), src1.k(), src1.nr(), src1.nc(), - src2.num_samples(), src2.k(), src2.nr(), src2.nc() - ); - } - } - } - - // ------------------------------------------------------------------------------------ - - __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = s1[i]+s2[i]; - } - } - - __global__ void _cuda_add2(float* d, const float* s1, const float* s2, - size_t dn, size_t dk, size_t dr, size_t dc, - size_t s1n, size_t s1k, size_t s1r, size_t s1c, - size_t s2n, size_t s2k, size_t s2r, size_t s2c) - { - for (auto i : grid_stride_range(0, dn*dk*dr*dc)) - { - size_t n,k,r,c; - unpack_idx(i, dk,dr,dc, n,k,r,c); - - float v1 = 0; - float v2 = 0; - - if (n < s1n && - k < s1k && - r < s1r && - c < s1c ) - { - v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; - } - - if (n < s2n && - k < s2k && - r < s2r && - c < s2c ) - { - v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; - } - - d[i] = v1+v2; - } - } - - void add ( - tensor& dest, - const tensor& src1, - const tensor& src2 - ) - { - if (dest.size() == 0) - return; - - // Do the simple and fast version if everything has the same dimensions - if (have_same_dimensions(dest, src1) && - have_same_dimensions(dest, src2)) - { - launch_kernel(_cuda_add1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); - } - else - { - // Otherwise, do the more complex version with bounds checking. 
- launch_kernel(_cuda_add2,max_jobs(dest.size()), - dest.device(), src1.device(), src2.device(), - dest.num_samples(), dest.k(), dest.nr(), dest.nc(), - src1.num_samples(), src1.k(), src1.nr(), src1.nc(), - src2.num_samples(), src2.k(), src2.nr(), src2.nc() - ); - } - - } - - // ------------------------------------------------------------------------------------ - - __global__ void _cuda_affine_transform1(float* d, const float* s, size_t n, float A, float B) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = A*s[i] + B; - } - } - - __global__ void _cuda_affine_transform1_0(float* d, const float* s, size_t n, float A) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = A*s[i]; - } - } - - void affine_transform( - tensor& dest, - const tensor& src, - const float A, - const float B - ) - { - DLIB_CASSERT(dest.size()==src.size()); - if (B != 0) - launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B); - else - launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); - } - - void affine_transform( - tensor& dest, - const tensor& src, - const float A - ) - { - DLIB_CASSERT(dest.size()==src.size()); - launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_affine_transform_rect( - float* d, - const float* s1, - const float* s2, - const float* s3, - float A, - float B, - float C, - size_t start_idx, - size_t n, - size_t rect_nc, - size_t total_nc - ) - { - for (auto i : grid_stride_range(0, n)) - { - size_t r = i/rect_nc; - size_t c = i%rect_nc; - size_t idx = r*total_nc + c + start_idx; - d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx]; - } - } - - void affine_transform( - const rectangle& rect, - tensor& dest, - const tensor& src1, - const tensor& src2, - const tensor& src3, - float A, - 
float B, - float C - ) - { - DLIB_CASSERT(dest.size() == src1.size()); - DLIB_CASSERT(dest.size() == src2.size()); - DLIB_CASSERT(dest.size() == src3.size()); - DLIB_CASSERT(dest.num_samples() == src1.num_samples()); - DLIB_CASSERT(dest.num_samples() == src2.num_samples()); - DLIB_CASSERT(dest.num_samples() == src3.num_samples()); - DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect)); - launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()), - dest.device(), src1.device(), src2.device(), src3.device(), A, B, C, - rect.left() + rect.top()*(dest.size()/dest.num_samples()), - rect.area(), - rect.width(), - dest.size()/dest.num_samples()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = A*s1[i] + B*s2[i] + C; - } - } - - __global__ void _cuda_affine_transform4_0(float* d, const float* s1, const float* s2, size_t n, float A, float B) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = A*s1[i] + B*s2[i]; - } - } - - void affine_transform( - tensor& dest, - const tensor& src1, - const tensor& src2, - const float A, - const float B, - const float C - ) - { - DLIB_CASSERT(dest.size()==src1.size()); - DLIB_CASSERT(dest.size()==src2.size()); - if (C != 0) - launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C); - else - launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); - } - - void affine_transform( - tensor& dest, - const tensor& src1, - const tensor& src2, - const float A, - const float B - ) - { - DLIB_CASSERT(dest.size()==src1.size()); - DLIB_CASSERT(dest.size()==src2.size()); - 
launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_add_scaled(float* d, const float* s, size_t n, float scale) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] += scale*s[i]; - } - } - - void add_scaled( - tensor& dest, - const float scale, - const tensor& src - ) - { - DLIB_CASSERT(dest.size()==src.size()); - launch_kernel(_cuda_add_scaled,max_jobs(dest.size()),dest.device(), src.device(), dest.size(), scale); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_add_cv_to_all_columns(float beta, float* dest, float alpha, const float* src, size_t size, size_t stride) - { - for (auto i : grid_stride_range(0, size)) - { - dest[i] = beta*dest[i] + alpha*src[i/stride]; - } - } - - __global__ void _cuda_add_cv_to_all_columns_no_beta(float* dest, float alpha, const float* src, size_t size, size_t stride) - { - for (auto i : grid_stride_range(0, size)) - { - dest[i] = alpha*src[i/stride]; - } - } - - void add_cv_to_all_columns( - float beta, - tensor& dest, - float alpha, - const tensor& src - ) - { - DLIB_CASSERT(dest.num_samples() == src.num_samples() && src.num_samples() == src.size()); - if (beta == 0) - launch_kernel(_cuda_add_cv_to_all_columns_no_beta, max_jobs(dest.size()), dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); - else - launch_kernel(_cuda_add_cv_to_all_columns, max_jobs(dest.size()), beta, dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_affine_transform5( - float* d, const float* s1, const float* s2, const float* s3, size_t n, float A, float B, float C, float D - ) - { - for (auto i : 
grid_stride_range(0, n)) - { - d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D; - } - } - - void affine_transform( - tensor& dest, - const tensor& src1, - const tensor& src2, - const tensor& src3, - const float A, - const float B, - const float C, - const float D - ) - { - DLIB_CASSERT(dest.size()==src1.size()); - DLIB_CASSERT(dest.size()==src2.size()); - DLIB_CASSERT(dest.size()==src3.size()); - launch_kernel(_cuda_affine_transform5,max_jobs(dest.size()),dest.device(), src1.device(), - src2.device(), src3.device(), dest.size(), A, B, C, D); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_affine_transform_range( - float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C - ) - { - for (auto i : grid_stride_range(begin, end)) - { - d[i] = A*s1[i] + B*s2[i] + C*s3[i]; - } - } - - - void affine_transform_range( - size_t begin, - size_t end, - tensor& dest, - const tensor& src1, - const tensor& src2, - const tensor& src3, - const float A, - const float B, - const float C - ) - { - DLIB_CASSERT(dest.size()==src1.size()); - DLIB_CASSERT(dest.size()==src2.size()); - DLIB_CASSERT(dest.size()==src3.size()); - DLIB_CASSERT(begin <= end && end <= dest.size()); - launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin), - dest.device(), src1.device(), - src2.device(), src3.device(), begin, end, A, B, C); - } - - // ----------------------------------------------------------------------------------- - - __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = A[i]*s[i] + B[i]; - } - } - __global__ void _cuda_affine_transform3(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = A[i%bs]*s[i] + B[i%bs]; - } - } - - void affine_transform( - tensor& dest, - const 
tensor& src, - const tensor& A, - const tensor& B - ) - { - DLIB_CASSERT(have_same_dimensions(dest, src)); - DLIB_CASSERT( - ((A.num_samples()==1 && B.num_samples()==1) || - (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples()))); - DLIB_CASSERT( - A.nr()==B.nr() && B.nr()==src.nr() && - A.nc()==B.nc() && B.nc()==src.nc() && - A.k() ==B.k() && B.k()==src.k(), - "\nA.nr(): " << A.nr() << "\nB.nr(): " << B.nr() << "\nsrc.nr(): " << src.nr() - <<"\nA.nc(): " << A.nc() << "\nB.nc(): " << B.nc() << "\nsrc.nc(): " << src.nc() - <<"\nA.k(): " << A.k() << "\nB.k(): " << B.k() << "\nsrc.k(): " << src.k() - ); - - if (A.num_samples() == 1) - { - launch_kernel(_cuda_affine_transform3,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device(), A.size()); - } - else - { - launch_kernel(_cuda_affine_transform2,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device()); - } - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_compute_adam_update( - size_t begin, - size_t end, - float* s, - float* m, - float* v, - const float alpha, - const float weight_decay, - const float momentum1, - const float momentum2, - const float* params, - const float* params_grad - ) - { - const float eps = 1e-8; - // The loop is equivalent to doing this: - // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad); - // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad); - // s = -alpha*m/(sqrt(v) + eps); - for (auto i : grid_stride_range(begin, end)) - { - float g = (weight_decay*params[i] + params_grad[i]); - m[i] = momentum1*m[i] + (1-momentum1)*g; - v[i] = momentum2*v[i] + (1-momentum2)*g*g; - s[i] = -alpha*m[i]/(std::sqrt(v[i]) + eps); - } - } - - void compute_adam_update ( - size_t begin, - size_t end, - tensor& s, - tensor& m, - tensor& v, - const float t, - const float learning_rate, - const float 
weight_decay, - const float momentum1, - const float momentum2, - const tensor& params, - const tensor& params_grad - ) - { - DLIB_CASSERT(s.size() == m.size() && - s.size() == v.size() && - s.size() == params.size() && - s.size() == params_grad.size()); - DLIB_CASSERT(begin <= end && end <= params.size()); - const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t)); - - launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin), - begin, end, s.device(), m.device(), v.device(), alpha, weight_decay, - momentum1, momentum2, params.device(), params_grad.device()); - } - - // ----------------------------------------------------------------------------------- - - __global__ void _cuda_affine_transform_conv(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs, size_t ks) - { - for (auto i : grid_stride_range(0, n)) - { - auto k = (i/bs)%ks; - d[i] = A[k]*s[i] + B[k]; - } - } - - void affine_transform_conv( - tensor& dest, - const tensor& src, - const tensor& A, - const tensor& B - ) - { - DLIB_CASSERT(have_same_dimensions(dest, src)); - DLIB_CASSERT(have_same_dimensions(A, B)); - DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 && A.nc() == 1 && A.k() == src.k()); - - launch_kernel(_cuda_affine_transform_conv,max_jobs(dest.size()), - dest.device(), src.device(), src.size(), A.device(), B.device(), src.nr()*src.nc(), src.k()); - } - - // ----------------------------------------------------------------------------------- - - __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n) - { - for (auto i : grid_stride_range(0, n)) - { - out[i] = in[i]; - for (size_t j = i+n; j < total_n; j+=n) - out[i] += in[j]; - } - } - - void assign_bias_gradient ( - tensor& grad, - const tensor& gradient_input - ) - { - DLIB_CASSERT( - grad.num_samples() == 1 && - gradient_input.k() == grad.k() && - gradient_input.nr() == grad.nr() && - gradient_input.nc() == grad.nc() && - gradient_input.size() 
> 0); - - launch_kernel(_add_bias_gradient,max_jobs(grad.size()),grad.device(), gradient_input.device(), grad.size(), gradient_input.size()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _set_tensor(float* out, size_t n, const float val) - { - for (auto i : grid_stride_range(0, n)) - out[i] = val; - } - - void set_tensor ( - tensor& t, - float value - ) - { - launch_kernel(_set_tensor, max_jobs(t.size()), t.device(), t.size(), value); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _scale_tensor(float* out, size_t n, const float val) - { - for (auto i : grid_stride_range(0, n)) - out[i] *= val; - } - - void scale_tensor ( - tensor& t, - float value - ) - { - launch_kernel(_scale_tensor, max_jobs(t.size()), t.device(), t.size(), value); - } - - // ----------------------------------------------------------------------------------- - // ----------------------------------------------------------------------------------- - - __global__ void _cuda_threshold(float* d, size_t n, float thresh) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = d[i]>thresh ? 1:0; - } - } - - void threshold ( - tensor& data, - float thresh - ) - { - launch_kernel(_cuda_threshold,max_jobs(data.size()),data.device(), data.size(), thresh); - } - - // ------------------------------------------------------------------------------------ - - __global__ void _cuda_dot(const float* a, const float* b, size_t n, float* result) - { - // Parallel sum everything into local temp variables. - float temp = 0; - for(auto i : grid_stride_range(0, n)) - temp += a[i]*b[i]; - - // Then do the warp reduce add thing to merge into one output value. 
- warp_reduce_atomic_add(*result, temp); - } - - - void dot ( - const tensor& a, - const tensor& b, - tensor& result, - size_t idx - ) - { - DLIB_CASSERT(a.size() == b.size()); - DLIB_CASSERT(idx < result.size()); - - launch_kernel(_cuda_dot, max_jobs(a.size()), a.device(), b.device(), a.size(), result.device()+idx); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_prelu(const float* s, float* d, size_t n, const float* pp) - { - const float p = *pp; - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0) - d[i] = s[i]; - else - d[i] = p*s[i]; - } - } - - void prelu ( - tensor& dest, - const tensor& src, - const tensor& param - ) - { - launch_kernel(_cuda_prelu, max_jobs(dest.size()), - src.device(), dest.device(), src.size(), param.device()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_prelu_gradient(float* out, const float* s, const float* gi, size_t n, const float* pp, float* ppgrad) - { - const float p = *pp; - float pgrad = 0; - for(auto i : grid_stride_range(0, n)) - { - if (s[i] > 0) - { - out[i] += gi[i]; - } - else - { - out[i] += p*gi[i]; - pgrad += gi[i]*s[i]; - } - } - - // Then do the warp reduce add thing to merge into one output value. 
- warp_reduce_atomic_add(*ppgrad, pgrad); - } - - void prelu_gradient ( - tensor& grad, - const tensor& src, - const tensor& gradient_input, - const tensor& param, - tensor& params_grad - ) - { - params_grad = 0; - launch_kernel(_cuda_prelu_gradient, max_jobs(grad.size()), - grad.device(), src.device(), gradient_input.device(), grad.size(), - param.device(), params_grad.device()); - } - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_leaky_relu(const float* s, float* d, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0) - d[i] = s[i]; - else - d[i] = alpha * s[i]; - } - } - - void leaky_relu( - tensor& dest, - const tensor& src, - const float alpha - ) - { - launch_kernel(_cuda_leaky_relu, max_jobs(dest.size()), - src.device(), dest.device(), src.size(), alpha); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_leaky_relu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0) - out[i] = gi[i]; - else - out[i] = alpha * gi[i]; - } - } - - __global__ void _cuda_leaky_relu_gradient(float* out, const float* s, const float* gi, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0) - out[i] += gi[i]; - else - out[i] += alpha * gi[i]; - } - } - - void leaky_relu_gradient ( - tensor& grad, - const tensor& src, - const tensor& gradient_input, - const float alpha - ) - { - float* out = grad.device(); - const float* gi = gradient_input.device(); - if (out == gi) - { - launch_kernel(_cuda_leaky_relu_gradient_inplace, max_jobs(grad.size()), - out, src.device(), gi, grad.size(), alpha); - } - else - { - launch_kernel(_cuda_leaky_relu_gradient, max_jobs(grad.size()), - out, src.device(), gi, grad.size(), alpha); - } - } - - // 
---------------------------------------------------------------------------------------- - - __global__ void _cuda_mish(const float* s, float* d, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - const auto e = std::exp(s[i]); - const auto delta = 2*e + e*e + 2; - d[i] = s[i] - 2*s[i]/delta; - } - } - - void mish ( - tensor& dest, - const tensor& src - ) - { - launch_kernel(_cuda_mish, max_jobs(dest.size()), src.device(), dest.device(), src.size()); - } - - // ---------------------------------------------------------------------------------------- - - __device__ float mish_compute_gradient(float x) - { - if (x >= 8) - return 1.f; - if (x <= -8) - return 0.f; - - const auto e = std::exp(x); - const auto delta = 2*e + e*e + 2; - const auto omega = 4*(x + 1) + 4*e*e + e*e*e + e*(4*x + 6); - return e*omega/(delta*delta); - } - - __global__ void _cuda_mish_gradient_inplace(float* out, const float* s, const float* gi, size_t n) - { - for (auto i : grid_stride_range(0, n)) - out[i] = gi[i]*mish_compute_gradient(s[i]); - } - - __global__ void _cuda_mish_gradient(float* out, const float* s, const float* gi, size_t n) - { - for (auto i : grid_stride_range(0, n)) - out[i] += gi[i]*mish_compute_gradient(s[i]); - } - - void mish_gradient ( - tensor& grad, - const tensor& src, - const tensor& gradient_input - ) - { - float* out = grad.device(); - const float* gi = gradient_input.device(); - if (out == gi) - launch_kernel(_cuda_mish_gradient_inplace, max_jobs(grad.size()), out, src.device(), gi, grad.size()); - else - launch_kernel(_cuda_mish_gradient, max_jobs(grad.size()), out, src.device(), gi, grad.size()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_clipped_relu(const float* s, float* d, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] < 0) - d[i] = 0; - else if (s[i] > alpha) - d[i] = alpha; - else - d[i] = s[i]; - } - } - - void clipped_relu 
( - tensor& dest, - const tensor &src, - const float alpha - ) - { - launch_kernel(_cuda_clipped_relu, max_jobs(dest.size()), - src.device(), dest.device(), src.size(), alpha); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_clipped_relu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0 && s[i] < alpha) - out[i] = gi[i]; - else - out[i] = 0.f; - } - } - - __global__ void _cuda_clipped_relu_gradient(float* out, const float* s, const float* gi, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0 && s[i] < alpha) - out[i] += gi[i]; - } - } - - void clipped_relu_gradient ( - tensor& grad, - const tensor& dest, - const tensor& gradient_input, - const float alpha - ) - { - float* out = grad.device(); - const float* gi = gradient_input.device(); - if (out == gi) - launch_kernel(_cuda_clipped_relu_gradient_inplace, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); - else - launch_kernel(_cuda_clipped_relu_gradient, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_elu(const float* s, float* d, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0) - d[i] = s[i]; - else - d[i] = alpha * (std::exp(s[i]) - 1.0f); - } - } - - void elu ( - tensor& dest, - const tensor &src, - const float alpha - ) - { - launch_kernel(_cuda_elu, max_jobs(dest.size()), src.device(), dest.device(), src.size(), alpha); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_elu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - 
{ - if (s[i] > 0) - out[i] = gi[i]; - else - out[i] = (alpha + s[i]) * gi[i]; - } - } - - __global__ void _cuda_elu_gradient(float* out, const float* s, const float* gi, size_t n, const float alpha) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] > 0) - out[i] += gi[i]; - else - out[i] += (alpha + s[i]) * gi[i]; - } - } - - void elu_gradient ( - tensor& grad, - const tensor& dest, - const tensor& gradient_input, - const float alpha - ) - { - float* out = grad.device(); - const float* gi = gradient_input.device(); - if (out == gi) - launch_kernel(_cuda_elu_gradient_inplace, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); - else - launch_kernel(_cuda_elu_gradient, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_gelu(const float* s, float* d, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = s[i] * normcdf(s[i]); - } - } - - void gelu ( - tensor& dest, - const tensor& src - ) - { - launch_kernel(_cuda_gelu, max_jobs(dest.size()), src.device(), dest.device(), src.size()); - } - - // ---------------------------------------------------------------------------------------- - - __device__ float gelu_compute_gradient(float x) - { - const float beta = 1.0f / CUDART_SQRT_2PI; - const float cdf = normcdf(x); - const float pdf = beta*std::exp(-0.5f*x*x); - return cdf + x * pdf; - } - - __global__ void _cuda_gelu_gradient_inplace(float* out, const float* s, const float* gi, size_t n) - { - for (auto i : grid_stride_range(0, n)) - out[i] = gi[i]*gelu_compute_gradient(s[i]); - } - - __global__ void _cuda_gelu_gradient(float* out, const float* s, const float* gi, size_t n) - { - for (auto i : grid_stride_range(0, n)) - out[i] += gi[i]*gelu_compute_gradient(s[i]); - } - - void gelu_gradient ( - tensor& grad, - const tensor& src, - const tensor& gradient_input - ) - { - float* out = 
grad.device(); - const float* gi = gradient_input.device(); - if (out == gi) - launch_kernel(_cuda_gelu_gradient_inplace, max_jobs(grad.size()), out, src.device(), gi, grad.size()); - else - launch_kernel(_cuda_gelu_gradient, max_jobs(grad.size()), out, src.device(), gi, grad.size()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_smelu (const float* s, float* d, size_t n, const float beta) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] >= beta) - d[i] = s[i]; - else if (s[i] <= -beta) - d[i] = 0; - else - d[i] = (s[i] + beta) * (s[i] + beta) / (4 * beta); - } - } - - void smelu ( - tensor& dest, - const tensor& src, - const float beta - ) - { - launch_kernel(_cuda_smelu, max_jobs(dest.size()), src.device(), dest.device(), src.size(), beta); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_smelu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float beta) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] >= beta) - out[i] = gi[i]; - else if (s[i] == 0) - out[i] = 0; - else - out[i] = std::sqrt(beta * s[i]) / beta * gi[i]; - } - } - - __global__ void _cuda_smelu_gradient(float* out, const float* s, const float* gi, size_t n, const float beta) - { - for (auto i : grid_stride_range(0, n)) - { - if (s[i] >= beta) - out[i] += gi[i]; - else if (s[i] == 0) - continue; - else - out[i] += std::sqrt(beta * s[i]) / beta * gi[i]; - } - } - - void smelu_gradient ( - tensor& grad, - const tensor& src, - const tensor& gradient_input, - const float beta - ) - { - float* out = grad.device(); - const float* gi = gradient_input.device(); - if (out == gi) - { - launch_kernel(_cuda_smelu_gradient_inplace, max_jobs(grad.size()), - out, src.device(), gi, grad.size(), beta); - } - else - { - launch_kernel(_cuda_smelu_gradient, max_jobs(grad.size()), - out, src.device(), gi, 
grad.size(), beta); - } - } - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_silu(const float* s, float* d, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - d[i] = s[i] / (1.0f + std::exp(-s[i])); - } - } - - void silu ( - tensor& dest, - const tensor& src - ) - { - launch_kernel(_cuda_silu, max_jobs(dest.size()), src.device(), dest.device(), src.size()); - } - - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_silu_gradient_inplace(float* out, const float* s, const float* gi, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - const auto sig_s = 1.0f / (1.0f + std::exp(-s[i])); - out[i] = gi[i] * (sig_s * (1.0f + s[i] * (1.0f - sig_s))); - } - } - - __global__ void _cuda_silu_gradient(float* out, const float* s, const float* gi, size_t n) - { - for (auto i : grid_stride_range(0, n)) - { - const auto sig_s = 1.0f / (1.0f + std::exp(-s[i])); - out[i] += gi[i] * (sig_s * (1.0f + s[i] * (1.0f - sig_s))); - } - } - - void silu_gradient ( - tensor& grad, - const tensor& src, - const tensor& gradient_input - ) - { - float* out = grad.device(); - const float* gi = gradient_input.device(); - if (out == gi) - launch_kernel(_cuda_silu_gradient_inplace, max_jobs(grad.size()), out, src.device(), gi, grad.size()); - else - launch_kernel(_cuda_silu_gradient, max_jobs(grad.size()), out, src.device(), gi, grad.size()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_resize_bilinear(size_t dsize, size_t dchan_size, size_t dnc, float* d, - size_t schan_size, int snr, int snc, const float* s, - const float x_scale, const float y_scale) - { - for(auto i : grid_stride_range(0, dsize)) - { - const int idx = i%dchan_size; - const int channel = i/dchan_size; - const int sidx = channel*schan_size; - const int r = idx/dnc; - const int c = idx%dnc; - - 
const float y = r*y_scale; - const int top = static_cast(::floorf(y)); - const int bottom = ::min(top+1, snr-1); - const float tb_frac = y - top; - - const float x = c*x_scale; - const int left = static_cast(::floorf(x)); - const int right = ::min(left+1, snc-1); - const float lr_frac = x - left; - - float tl = s[sidx+top*snc+left]; - float tr = s[sidx+top*snc+right]; - float bl = s[sidx+bottom*snc+left]; - float br = s[sidx+bottom*snc+right]; - - float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + - tb_frac*((1-lr_frac)*bl + lr_frac*br); - - d[i] = temp; - } - } - - __global__ void _cuda_resize_bilinear_strided(size_t dsize, size_t dchan_size, size_t dnc, float* d, - size_t schan_size, int snr, int snc, const float* s, - const float x_scale, const float y_scale, - size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided - ) - { - for(auto i : grid_stride_range(0, dsize)) - { - const int idx = i%dchan_size; - const int channel = i/dchan_size; - const int sidx = channel*schan_size; - const int r = idx/dnc; - const int c = idx%dnc; - const int didx = channel*dest_chan_size_strided + r*dest_row_stride+c; - - const float y = r*y_scale; - const int top = static_cast(::floorf(y)); - const int bottom = ::min(top+1, snr-1); - const float tb_frac = y - top; - - const float x = c*x_scale; - const int left = static_cast(::floorf(x)); - const int right = ::min(left+1, snc-1); - const float lr_frac = x - left; - - float tl = s[sidx+top*src_row_stride+left]; - float tr = s[sidx+top*src_row_stride+right]; - float bl = s[sidx+bottom*src_row_stride+left]; - float br = s[sidx+bottom*src_row_stride+right]; - - float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + - tb_frac*((1-lr_frac)*bl + lr_frac*br); - - d[didx] = temp; - } - } - - void resize_bilinear ( - tensor& dest, - long long dest_row_stride, - long long dest_channel_stride, - const tensor& src, - long long src_row_stride, - long long src_channel_stride - ) - { - DLIB_CASSERT(is_same_object(dest, 
src)==false); - DLIB_CASSERT(dest.num_samples() == src.num_samples()); - DLIB_CASSERT(dest.k() == src.k()); - - if (dest.size() == 0 || src.size() == 0) - return; - - const float x_scale = (src.nc()-1)/(float)std::max((dest.nc()-1),1); - const float y_scale = (src.nr()-1)/(float)std::max((dest.nr()-1),1); - - if (dest.nc() == dest_row_stride && dest.nr()*dest.nc()==dest_channel_stride && - src.nc() == src_row_stride && src.nr()*src.nc()==src_channel_stride) - { - launch_kernel(_cuda_resize_bilinear, - dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), - src.nr()*src.nc(), src.nr(), src.nc(), src.device(), - x_scale, y_scale); - } - else - { - launch_kernel(_cuda_resize_bilinear_strided, - dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), - src_channel_stride, src.nr(), src.nc(), src.device(), - x_scale, y_scale, dest_row_stride, src_row_stride, dest_channel_stride); - } - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_resize_bilinear_gradient(size_t dsize, size_t dchan_size, size_t dnc, const float* d, - size_t schan_size, int snr, int snc, float* s, - const float x_scale, const float y_scale) - { - for(auto i : grid_stride_range(0, dsize)) - { - const float tmp = d[i]; - - const int idx = i%dchan_size; - const int channel = i/dchan_size; - const int sidx = channel*schan_size; - const int r = idx/dnc; - const int c = idx%dnc; - - const float y = r*y_scale; - const int top = static_cast(::floorf(y)); - const int bottom = ::min(top+1, snr-1); - const float tb_frac = y - top; - - const float x = c*x_scale; - const int left = static_cast(::floorf(x)); - const int right = ::min(left+1, snc-1); - const float lr_frac = x - left; - - - atomicAdd(s+sidx+top*snc+left, tmp*(1-tb_frac)*(1-lr_frac)); - atomicAdd(s+sidx+top*snc+right, tmp*(1-tb_frac)*(lr_frac)); - atomicAdd(s+sidx+bottom*snc+left, tmp*(tb_frac)*(1-lr_frac)); - atomicAdd(s+sidx+bottom*snc+right, 
tmp*(tb_frac)*(lr_frac)); - } - } - - __global__ void _cuda_resize_bilinear_gradient_strided(size_t dsize, size_t dchan_size, size_t dnc, const float* d, - size_t schan_size, int snr, int snc, float* s, - const float x_scale, const float y_scale, - size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided - ) - { - for(auto i : grid_stride_range(0, dsize)) - { - - const int idx = i%dchan_size; - const int channel = i/dchan_size; - const int didx = channel*dest_chan_size_strided; - const int sidx = channel*schan_size; - const int r = idx/dnc; - const int c = idx%dnc; - - const float tmp = d[didx + r*dest_row_stride+c]; - - const float y = r*y_scale; - const int top = static_cast(::floorf(y)); - const int bottom = ::min(top+1, snr-1); - const float tb_frac = y - top; - - const float x = c*x_scale; - const int left = static_cast(::floorf(x)); - const int right = ::min(left+1, snc-1); - const float lr_frac = x - left; - - - atomicAdd(s+sidx+top*src_row_stride+left, tmp*(1-tb_frac)*(1-lr_frac)); - atomicAdd(s+sidx+top*src_row_stride+right, tmp*(1-tb_frac)*(lr_frac)); - atomicAdd(s+sidx+bottom*src_row_stride+left, tmp*(tb_frac)*(1-lr_frac)); - atomicAdd(s+sidx+bottom*src_row_stride+right, tmp*(tb_frac)*(lr_frac)); - } - } - - void resize_bilinear_gradient ( - tensor& grad, - long long grad_row_stride, - long long grad_channel_stride, - const tensor& gradient_input, - long long gradient_input_row_stride, - long long gradient_input_channel_stride - ) - { - DLIB_CASSERT(is_same_object(grad, gradient_input)==false); - DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples()); - DLIB_CASSERT(gradient_input.k() == grad.k()); - - if (grad.size() == 0 || gradient_input.size() == 0) - return; - - const float x_scale = (grad.nc()-1)/(float)std::max((gradient_input.nc()-1),1); - const float y_scale = (grad.nr()-1)/(float)std::max((gradient_input.nr()-1),1); - - if (grad.nc() == grad_row_stride && grad.nr()*grad.nc()==grad_channel_stride && - 
gradient_input.nc() == gradient_input_row_stride && gradient_input.nr()*gradient_input.nc()==gradient_input_channel_stride) - { - launch_kernel(_cuda_resize_bilinear_gradient, - gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), - grad.nr()*grad.nc(), grad.nr(), grad.nc(), grad.device(), - x_scale, y_scale); - } - else - { - launch_kernel(_cuda_resize_bilinear_gradient_strided, - gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), - grad_channel_stride, grad.nr(), grad.nc(), grad.device(), - x_scale, y_scale, gradient_input_row_stride, grad_row_stride, gradient_input_channel_stride); - } - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_reorg(size_t dsize, size_t dk, size_t dnr, size_t dnc, float* d, - size_t sk, size_t snr, int snc, const float* s, - const size_t row_stride, const size_t col_stride, const bool add_to) - { - const auto out_plane_size = dnr * dnc; - const auto out_sample_size = dk * out_plane_size; - for (auto i : grid_stride_range(0, dsize)) - { - const auto n = i / out_sample_size; - const auto out_idx = i % out_sample_size; - const auto out_k = out_idx / out_plane_size; - const auto out_rc = out_idx % out_plane_size; - const auto out_r = out_rc / dnc; - const auto out_c = out_rc % dnc; - - const auto in_k = out_k % sk; - const auto in_r = out_r * row_stride + (out_k / sk) / col_stride; - const auto in_c = out_c * col_stride + (out_k / sk) % col_stride; - - const auto in_idx = ((n * sk + in_k) * snr + in_r) * snc + in_c; - if (add_to) d[i] += s[in_idx]; - else d[i] = s[in_idx]; - } - } - - __global__ void _cuda_reorg_gradient(size_t ssize, size_t dk, size_t dnr, size_t dnc, float* d, - size_t sk, size_t snr, int snc, const float* s, const size_t row_stride, - const size_t col_stride, const bool add_to - ) - { - for(auto i : grid_stride_range(0, ssize)) - { - 
const auto n = i / (sk * snr * snc); - const auto sample_idx = i % (sk * snr * snc); - const auto in_k = (sample_idx / (snr * snc)) % sk; - const auto in_r = (sample_idx / snc) % snr; - const auto in_c = sample_idx % snc; - - const auto out_k = in_k % dk; - const auto out_r = in_r * row_stride + (in_k / dk) / col_stride; - const auto out_c = in_c * col_stride + (in_k / dk) % col_stride; - const auto out_idx = ((n * dk + out_k) * dnr + out_r) * dnc + out_c; - - if (add_to) d[out_idx] += s[i]; - else d[out_idx] = s[i]; - } - } - - void reorg( - bool add_to, - tensor& dest, - const int row_stride, - const int col_stride, - const tensor& src - ) - { - DLIB_CASSERT(!is_same_object(dest, src), "Destination and source must be distinct objects."); - DLIB_CASSERT(src.nr() % row_stride == 0, "The number of rows in src must be divisible by row_stride."); - DLIB_CASSERT(src.nc() % col_stride == 0, "The number of columns in src must be divisible by col_stride."); - DLIB_CASSERT(dest.num_samples() == src.num_samples(), "The number of samples must match."); - DLIB_CASSERT(dest.k() == src.k() * row_stride * col_stride, "The number of channels must match."); - DLIB_CASSERT(dest.nr() == src.nr() / row_stride, "The number of rows must match."); - DLIB_CASSERT(dest.nc() == src.nc() / col_stride, "The number of columns must match."); - - launch_kernel(_cuda_reorg, dest.size(), dest.k(), dest.nr(), dest.nc(), dest.device(), - src.k(), src.nr(), src.nc(), src.device(), row_stride, col_stride, add_to); - } - - void reorg_gradient( - bool add_to, - tensor& grad, - const int row_stride, - const int col_stride, - const tensor& gradient_input - ) - { - DLIB_CASSERT(!is_same_object(grad, gradient_input), "Grad and gradient_input must be distinct objects."); - DLIB_CASSERT(grad.nr() % row_stride == 0, "The number of rows in grad must be divisible by row_stride."); - DLIB_CASSERT(grad.nc() % col_stride == 0, "The number of columns in grad must be divisible by col_stride."); - 
DLIB_CASSERT(grad.num_samples() == gradient_input.num_samples(), "The number of samples in grad and gradient_input must match."); - DLIB_CASSERT(grad.k() == gradient_input.k() / row_stride / col_stride, "The number of channels in grad must be gradient_input.k() divided by row_stride and col_stride."); - DLIB_CASSERT(grad.nr() == gradient_input.nr() * row_stride, "The number of rows in grad must be gradient_input.nr() multiplied by row_stride."); - DLIB_CASSERT(grad.nc() == gradient_input.nc() * col_stride, "The number of columns in grad must be gradient_input.nc() multiplied by col_stride."); - - launch_kernel(_cuda_reorg_gradient, gradient_input.size(), grad.k(), grad.nr(), grad.nc(), grad.device(), - gradient_input.k(), gradient_input.nr(), gradient_input.nc(), gradient_input.device(), - row_stride, col_stride, add_to); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_embeddings(size_t dsize, size_t dk, size_t dr, size_t dc, - float* d, const float* s, const float* e, size_t es - ) - { - for (auto i : grid_stride_range(0, dsize)) - { - const auto n = i / (dk * dr * dc); - const auto s_idx = i % (dk * dr * dc); - const auto k = (s_idx / (dr * dc)) % dk; - const auto r = (s_idx / dc) % dr; - const auto c = s_idx % dc; - - const unsigned long t_idx = static_cast(s[(n * dk + k) * dr + r]); - - if (t_idx < es) - d[i] = e[t_idx * dc + c]; - else - d[i] = 0.0f; - } - } - - void embeddings( - resizable_tensor& dest, - const tensor& src, - const tensor& embs - ) - { - DLIB_CASSERT( - src.nr() > 0 && - embs.num_samples() > 0 && - embs.k() > 0 && - embs.nr() == 1 && - embs.nc() == 1, - "\nsrc.num_samples(): " << src.num_samples() << - "\nsrc.k(): " << src.k() << - "\nsrc.nr(): " << src.nr() << - "\nsrc.nc(): " << src.nc() << - "\nembs.num_samples(): " << embs.num_samples() << - "\nembs.k(): " << embs.k() << - "\nembs.nr(): " << embs.nr() << - "\nembs.nc(): " << embs.nc() - ); - - const long dk = 
dest.k(); - const long dr = dest.nr(); - const long dc = dest.nc(); - - launch_kernel(_cuda_embeddings, dest.size(), dk, dr, dc, - dest.device(), src.device(), embs.device(), embs.num_samples()); - } - - __global__ void _cuda_embeddings_gradient(size_t ssize, size_t sk, size_t sr, size_t sc, - const float* o, const float* gi, float* g, const float* f, float lr, bool sl, size_t es - ) - { - for (auto i : grid_stride_range(0, ssize)) - { - const auto n = i / (sk * sr * sc); - const auto s_idx = i % (sk * sr * sc); - const auto k = (s_idx / (sr * sc)) % sk; - const auto r = (s_idx / sc) % sr; - const auto c = s_idx % sc; - - const unsigned long t_idx = static_cast(o[(n * sk + k) * sr + r]); - if (t_idx < es) - { - const float f_t = f[t_idx]; - float f_s = 1.0f; - - if (sl && f_t != 0.0f) f_s = fminf(0.15f, fmaxf(1.0f / f_t, 1.0f)); - if (f_t > 1) atomicAdd(&g[t_idx * sc + c], -gi[i] * lr * f_s); - else g[t_idx * sc + c] -= gi[i] * lr * f_s; - } - } - } - - void embeddings_gradient( - const tensor& prev, - const tensor& gradient_input, - tensor& grads, - const tensor& freqs, - float learning_rate, - bool scale - ) - { - DLIB_CASSERT( - prev.nr() > 0 && - gradient_input.num_samples() == prev.num_samples() && - gradient_input.k() == prev.k() && - gradient_input.nr() == prev.nr() && - gradient_input.nc() == grads.k() && - grads.num_samples() > 0 && - grads.k() > 0 && - grads.nr() == 1 && - grads.nc() == 1, - "\ngradient_input.num_samples(): " << gradient_input.num_samples() << - "\ngradient_input.k(): " << gradient_input.k() << - "\ngradient_input.nr(): " << gradient_input.nr() << - "\ngradient_input.nc(): " << gradient_input.nc() << - "\nprev.num_samples(): " << prev.num_samples() << - "\nprev.k(): " << prev.k() << - "\nprev.nr(): " << prev.nr() << - "\nprev.nc(): " << prev.nc() << - "\ngrads.num_samples(): " << grads.num_samples() << - "\ngrads.k(): " << grads.k() << - "\ngrads.nr(): " << grads.nr() << - "\ngrads.nc(): " << grads.nc() - ); - - const long sk = 
gradient_input.k(); - const long sr = gradient_input.nr(); - const long sc = gradient_input.nc(); - - launch_kernel(_cuda_embeddings_gradient, gradient_input.size(), sk, sr, sc, - prev.device(), gradient_input.device(), grads.device(), freqs.device(), - learning_rate, scale, grads.num_samples()); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_layer_normalize( - float* out, - const float* s, - float* m, - float* v, - const float* g, - const float* b, - float eps, - size_t ns, - size_t k, - size_t num - ) - { - // compute means and sum of squares - for (auto n : grid_stride_range_y(0, ns)) - { - const auto ps = s + n * k * num; - float means = 0; - float invstds = 0; - for (auto i : grid_stride_range(0, k * num)) - { - means += ps[i]; - invstds += ps[i] * ps[i]; - } - warp_reduce_atomic_add(m[n], means / (k * num)); - warp_reduce_atomic_add(v[n], invstds / (k * num)); - } - __syncthreads(); - - // compute variances - for (auto n : grid_stride_range_y(0, ns)) - { - for (auto i : grid_stride_range(0, 1)) - { - v[n] = 1.0f / std::sqrt(v[n] - m[n] * m[n] + eps); - } - } - __syncthreads(); - - for (auto n : grid_stride_range_y(0, ns)) - { - const auto ps = s + n * k * num; - const auto pout = out + n * k * num; - for (auto i : grid_stride_range(0, k * num)) - { - pout[i] = (ps[i] - m[n]) * v[n]; - pout[i] = pout[i] * g[i / num] + b[i / num]; - } - } - } - - void layer_normalize ( - const double eps, - resizable_tensor& dest, - resizable_tensor& means, - resizable_tensor& invstds, - const tensor& src, - const tensor& gamma, - const tensor& beta - ) - { - const long num = src.nr() * src.nc(); - DLIB_CASSERT( - have_same_dimensions(gamma, beta) && - gamma.k() == src.k() && - gamma.nr() == 1 && - gamma.nc() == 1 && - eps > 0, - "\nsrc.k(): " << src.k() << - "\ngamma.k(): " << gamma.k() << - "\ngamma.nr(): " << gamma.nr() << - "\ngamma.nc(): " << gamma.nc() << - "\nbeta.k(): " << beta.k() << - 
"\nbeta.nr(): " << beta.nr() << - "\nbeta.nc(): " << beta.nc() << - "\neps: " << eps - ); - - dest.copy_size(src); - means.set_size(src.num_samples()); - invstds.set_size(src.num_samples()); - means = 0; - invstds = 0; - launch_kernel(_cuda_layer_normalize, max_jobs(src.k() * num, src.num_samples()), dest.device(), src.device(), - means.device(), invstds.device(), gamma.device(), beta.device(), eps, src.num_samples(), src.k(), num); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_layer_normalize_gradient( - float* out, - float* gg, - float* bg, - const float* s, - const float* gi, - const float* m, - const float* v, - const float* g, - float* dm, - float* dv, - float eps, - size_t ns, - size_t ks, - size_t num) - { - for (auto nk : grid_stride_range_y(0, ns * ks)) - { - const auto n = nk / ks; - const auto k = nk % ks; - const auto ps = s + (n * ks + k) * num; - const auto pgi = gi + (n * ks + k) * num; - const float invstd_pow = -0.5 * std::pow(v[n], 3.0f); - float temp_bg = 0; - float temp_gg = 0; - float temp_dv = 0; - for (auto i : grid_stride_range(0, num)) - { - const float x_hat = (ps[i] - m[n]) * v[n]; - const float dx = pgi[i] * g[i / num]; - temp_bg += pgi[i]; - temp_gg += pgi[i] * x_hat; - temp_dv += dx * (ps[i] - m[n]) * invstd_pow; - } - warp_reduce_atomic_add(bg[k], temp_bg); - warp_reduce_atomic_add(gg[k], temp_gg); - warp_reduce_atomic_add(dv[n], temp_dv); - } - __syncthreads(); - - const float invnum = 1.0f / (ks * num); - for (auto n : grid_stride_range_y(0, ns)) - { - const auto ps = s + n * ks * num; - const auto pgi = gi + n * ks * num; - float temp_dm = 0; - for (auto i : grid_stride_range(0, ks * num)) - { - const float dx = pgi[i] * g[i / num]; - temp_dm += -dx * v[n] + dv[n] * -2 * (ps[i] - m[n]) * invnum; - } - warp_reduce_atomic_add(dm[n], temp_dm); - } - __syncthreads(); - - for (auto n : grid_stride_range_y(0, ns)) - { - const auto ps = s + n * ks * num; - 
const auto pgi = gi + n * ks * num; - const auto pout = out + n * ks * num; - for (auto i : grid_stride_range(0, ks * num)) - { - const float dx = pgi[i] * g[i / num]; - pout[i] += dx * v[n] + dv[n] * 2 * (ps[i] - m[n]) * invnum + dm[n] * invnum; - } - } - } - - void layer_normalize_gradient ( - const double eps, - const tensor& gradient_input, - const tensor& means, - const tensor& invstds, - const tensor& src, - const tensor& gamma, - tensor& src_grad, - tensor& gamma_grad, - tensor& beta_grad, - resizable_tensor& dmeans, - resizable_tensor& dvars - ) - { - const long num = src.nr() * src.nc(); - DLIB_CASSERT(src.num_samples() == means.size()); - DLIB_CASSERT(src.num_samples() == invstds.size()); - DLIB_CASSERT(have_same_dimensions(gamma, gamma_grad)); - DLIB_CASSERT(have_same_dimensions(gamma_grad, beta_grad)); - DLIB_CASSERT(gamma.k() == src.k()); - DLIB_CASSERT(gamma.nr() == 1); - DLIB_CASSERT(gamma.nc() == 1); - DLIB_CASSERT(have_same_dimensions(gradient_input, src)); - DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); - DLIB_CASSERT(eps > 0); - - beta_grad = 0; - gamma_grad = 0; - dvars.copy_size(invstds); - dmeans.copy_size(means); - dvars = 0; - dmeans = 0; - launch_kernel(_cuda_layer_normalize_gradient, max_jobs(src.k() * num, src.num_samples()), - src_grad.device(), gamma_grad.device(), beta_grad.device(), src.device(), - gradient_input.device(), means.device(), invstds.device(), gamma.device(), - dmeans.device(), dvars.device(), eps, src.num_samples(), src.k(), num); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_rms_normalize( - float* dest, - float* scale, - const float* src, - const float* gamma, - float eps, - size_t ns, - size_t ks, - size_t num - ) - { - for (auto n : grid_stride_range_y(0, ns)) - { - const auto ps = src + n * ks * num; - float sum_squares = 0.0f; - for (auto i : grid_stride_range(0, ks * num)) - { - sum_squares += ps[i] * ps[i]; - } - 
warp_reduce_atomic_add(scale[n], sum_squares / (ks * num)); - } - __syncthreads(); - - for (auto n : grid_stride_range_y(0, ns)) - { - for (auto i : grid_stride_range(0, 1)) - { - scale[n] = 1.0f / std::sqrt(scale[n] + eps); - } - } - __syncthreads(); - - for (auto n : grid_stride_range_y(0, ns)) - { - const auto ps = src + n * ks * num; - const auto pd = dest + n * ks * num; - for (auto i : grid_stride_range(0, ks * num)) - { - pd[i] = ps[i] * scale[n] * gamma[i / num]; - } - } - } - - void rms_normalize( - const double eps, - resizable_tensor& dest, - resizable_tensor& scale, - const tensor& src, - const tensor& gamma - ) - { - DLIB_CASSERT( - gamma.k() == src.k() && - gamma.nr() == 1 && - gamma.nc() == 1 && - eps > 0, - "\nsrc.k(): " << src.k() << - "\ngamma.k(): " << gamma.k() << - "\ngamma.nr(): " << gamma.nr() << - "\ngamma.nc(): " << gamma.nc() << - "\neps: " << eps - ); - - const long ns = src.num_samples(); - const long ks = src.k(); - const long num = src.nr() * src.nc(); - - dest.copy_size(src); - scale.set_size(ns); - scale = 0; - - launch_kernel(_cuda_rms_normalize, max_jobs(ks * num, ns), - dest.device(), scale.device(), src.device(), gamma.device(), eps, ns, ks, num); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_rms_normalize_gradient( - float* src_grad, - float* gamma_grad, - float* dscale, - const float* src, - const float* gradient_input, - const float* scale, - const float* gamma, - size_t ns, - size_t ks, - size_t num - ) - { - for (auto nk : grid_stride_range_y(0, ns * ks)) - { - const auto n = nk / ks; - const auto k = nk % ks; - const auto ps = src + (n * ks + k) * num; - const auto pgi = gradient_input + (n * ks + k) * num; - const float scale_pow = -0.5f * std::pow(scale[n], 3.0f); - float temp_gg = 0.0f; - float temp_ds = 0.0f; - for (auto i : grid_stride_range(0, num)) - { - const float x_hat = ps[i] * scale[n]; - const float dx = pgi[i] * gamma[i / num]; - 
temp_gg += pgi[i] * x_hat; - temp_ds += dx * ps[i] * scale_pow; - } - warp_reduce_atomic_add(gamma_grad[k], temp_gg); - warp_reduce_atomic_add(dscale[n], temp_ds); - } - __syncthreads(); - - const float invnum = 1.0f / (ks * num); - for (auto n : grid_stride_range_y(0, ns)) - { - const auto ps = src + n * ks * num; - const auto pgi = gradient_input + n * ks * num; - const auto psg = src_grad + n * ks * num; - for (auto i : grid_stride_range(0, ks * num)) - { - const float dx = pgi[i] * gamma[i / num]; - psg[i] += dx * scale[n] + dscale[n] * 2 * ps[i] * invnum; - } - } - } - - void rms_normalize_gradient( - const tensor& gradient_input, - const tensor& scale, - const tensor& src, - const tensor& gamma, - tensor& src_grad, - tensor& gamma_grad, - resizable_tensor& dscale - ) - { - DLIB_CASSERT(src.num_samples() == scale.size()); - DLIB_CASSERT(have_same_dimensions(gamma, gamma_grad)); - DLIB_CASSERT(gamma.k() == src.k()); - DLIB_CASSERT(gamma.nr() == 1); - DLIB_CASSERT(gamma.nc() == 1); - DLIB_CASSERT(have_same_dimensions(gradient_input, src)); - DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); - - const long ns = src.num_samples(); - const long ks = src.k(); - const long num = src.nr() * src.nc(); - - gamma_grad = 0; - dscale.copy_size(scale); - dscale = 0; - - // Lancement du kernel CUDA - launch_kernel(_cuda_rms_normalize_gradient, max_jobs(ks * num, ns), - src_grad.device(), gamma_grad.device(), dscale.device(), - src.device(), gradient_input.device(), scale.device(), gamma.device(), - ns, ks, num); - } - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) - { - for(auto i : grid_stride_range(0, size)) - { - size_t blk = i/block_size; - size_t j = i%block_size; - dest[blk*dest_stride + j] += src[blk*src_stride + j]; - } - } - - __global__ void _cuda_copy_tensor 
(float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) - { - for(auto i : grid_stride_range(0, size)) - { - size_t blk = i/block_size; - size_t j = i%block_size; - dest[blk*dest_stride + j] = src[blk*src_stride + j]; - } - } - - void copy_tensor( - bool add_to, - tensor& dest, - size_t dest_k_offset, - const tensor& src, - size_t src_k_offset, - size_t count_k - ) - { - const size_t dest_sample_size = static_cast(dest.nc() * dest.nr() * dest.k()); - const size_t src_sample_size = static_cast(src.nc() * src.nr() * src.k()); - - const size_t block_size = count_k * dest.nc() * dest.nr(); - - DLIB_CASSERT(dest.num_samples() == src.num_samples() && - dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size"); - DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor"); - DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor"); - - float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr(); - const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();; - - if (add_to) - { - launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()), - dest_p, block_size*dest.num_samples(), - src_p, dest_sample_size, src_sample_size, block_size); - } - else - { - launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()), - dest_p, block_size*dest.num_samples(), - src_p, dest_sample_size, src_sample_size, block_size); - } - } - - __global__ void _cuda_copy_strided_tensor_add_to (float* dest, const float* src, - size_t ns, size_t nk, size_t nr, size_t nc, - size_t dk, size_t dr, size_t dc, - size_t sk, size_t sr, size_t sc) - { - for(auto i : grid_stride_range(0, ns*nk*nr*nc)) - { - size_t n,k,r,c; - unpack_idx(i, nk,nr,nc, n,k,r,c); - dest[pack_idx(dk,dr,dc, n,k,r,c)] += src[pack_idx(sk,sr,sc, n,k,r,c)]; - } - } - - __global__ void _cuda_copy_strided_tensor (float* dest, const float* src, - size_t ns, size_t nk, size_t nr, size_t 
nc, - size_t dk, size_t dr, size_t dc, - size_t sk, size_t sr, size_t sc) - { - for(auto i : grid_stride_range(0, ns*nk*nr*nc)) - { - size_t n,k,r,c; - unpack_idx(i, nk,nr,nc, n,k,r,c); - dest[pack_idx(dk,dr,dc, n,k,r,c)] = src[pack_idx(sk,sr,sc, n,k,r,c)]; - } - } - - void copy_tensor( - bool add_to, - tensor& dest, - size_t dk, size_t dnr, size_t dnc, - const tensor& src, - size_t sk, size_t snr, size_t snc, - size_t k, size_t nr, size_t nc - ) - { - - DLIB_CASSERT(dest.num_samples() == src.num_samples(), "All sources should fit into dest tensor size"); - DLIB_CASSERT(dest.k() - dk >= k && - dest.nr() - dnr >= nr && - dest.nc() - dnc >= nc, "Not enough space in dest tensor"); - DLIB_CASSERT(src.k() - sk >= k && - src.nr() - snr >= nr && - src.nc() - snc >= nc, "Not enough space in src tensor"); - - float* dest_p = dest.device() + dk * static_cast(dest.nc() * dest.nr()) \ - + dnr * static_cast(dest.nc()) \ - + dnc; - - const float* src_p = src.device() + sk * static_cast(src.nc() * src.nr()) \ - + snr * static_cast(src.nc()) \ - + snc; - - if (add_to) - { - launch_kernel(_cuda_copy_strided_tensor_add_to, max_jobs(dest.size()), - dest_p, src_p, dest.num_samples(), - k, nr, nc, - dest.k(), dest.nr(), dest.nc(), - src.k(), src.nr(), src.nc()); - } - else - { - launch_kernel(_cuda_copy_strided_tensor, max_jobs(dest.size()), - dest_p, src_p, dest.num_samples(), - k, nr, nc, - dest.k(), dest.nr(), dest.nc(), - src.k(), src.nr(), src.nc()); - } - } - - - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_transpose(size_t dsize, size_t dk, size_t dnr, size_t dnc, float* d, - size_t sk, size_t snr, int snc, const float* s, const bool add_to) - { - const auto plane_size = dnr * dnc; - const auto sample_size = dk * plane_size; - for (auto i : grid_stride_range(0, dsize)) - { - const auto n = i / sample_size; - const auto idx = i % plane_size; - const auto in_k = (i / plane_size) % dk; - const auto in_r = idx 
% dnc; - const auto in_c = idx / dnc; - - const auto in_idx = ((n * sk + in_k) * snr + in_r) * snc + in_c; - if (add_to) d[i] += s[in_idx]; - else d[i] = s[in_idx]; - } - } - - void transpose( - bool add_to, - tensor& dest, - const tensor& src - ) - { - DLIB_CASSERT(is_same_object(dest, src) == false); - DLIB_CASSERT(dest.num_samples() == src.num_samples() && - dest.k() == src.k() && - dest.nr() == src.nc() && - dest.nc() == src.nr(), - "Incompatible tensor dimensions."); - - launch_kernel(_cuda_transpose, max_jobs(dest.size()), dest.size(), - dest.k(), dest.nr(), dest.nc(), dest.device(), - src.k(), src.nr(), src.nc(), src.device(), add_to); - } - - // ---------------------------------------------------------------------------------------- - - // CUDA Kernels for ACT operations - __global__ void _cuda_compute_act_halt_probabilities( - float* halt_probs, - float* logits, - const float* input_data, - const float* W_halt, - float b_halt, - size_t batch_size, - size_t seq_len, - size_t d_model, - size_t num_channels, - size_t feature_dim - ) - { - const long total_positions = batch_size * seq_len; - - for (auto pos : grid_stride_range_y(0, total_positions)) - for (auto i : grid_stride_range(0, 1)) - logits[pos] = b_halt; - __syncthreads(); - - for (auto pos : grid_stride_range_y(0, total_positions)) - { - const long n = pos / seq_len; - const long s = pos % seq_len; - - float temp = 0; - for (auto feat_idx : grid_stride_range(0, feature_dim)) - { - const long c = feat_idx / d_model; - const long d = feat_idx % d_model; - - const long in_idx = ((n * num_channels + c) * seq_len + s) * d_model + d; - temp += input_data[in_idx] * W_halt[feat_idx]; - } - - warp_reduce_atomic_add(logits[pos], temp); - } - __syncthreads(); - - for (auto pos : grid_stride_range(0, total_positions)) - { - halt_probs[pos] = 1.0f / (1.0f + expf(-logits[pos])); - } - } - - void compute_act_halt_probabilities( - resizable_tensor& halt_probs, - resizable_tensor& logits, - const tensor& input_data, 
- const tensor& halt_params, - long batch_size, - long seq_len, - long feature_dim - ) - { - const long total_positions = batch_size * seq_len; - const long d_model = feature_dim / input_data.k(); - const long num_channels = input_data.k(); - - halt_probs.set_size(total_positions, 1, 1, 1); - logits.set_size(total_positions, 1, 1, 1); - - launch_kernel(_cuda_compute_act_halt_probabilities, - max_jobs(feature_dim, total_positions), - halt_probs.device(), - logits.device(), - input_data.device(), - halt_params.device(), - halt_params.host()[feature_dim], - batch_size, - seq_len, - d_model, - num_channels, - feature_dim); - } - - __global__ void _cuda_update_act_state( - float* output, - const float* input_data, - const float* halt_probs, - float* cumulative_halting, - float* remainders, - float* n_steps, - float* effective_weights, - size_t batch_size, - size_t seq_len, - size_t d_model, - size_t num_channels, - float halt_threshold, - long current_step - ) - { - for (auto pos : grid_stride_range(0, batch_size * seq_len)) - { - if (cumulative_halting[pos] < halt_threshold) - { - const size_t n = pos / seq_len; - const size_t s = pos % seq_len; - - float p = halt_probs[pos]; - float r = remainders[pos]; - float effective = fminf(p * r, halt_threshold - cumulative_halting[pos]); - - cumulative_halting[pos] += effective; - remainders[pos] -= effective; - n_steps[pos] = static_cast(current_step + 1); - effective_weights[pos] += effective; - - for (size_t c = 0; c < num_channels; ++c) { - for (size_t d = 0; d < d_model; ++d) { - const size_t idx = ((n * num_channels + c) * seq_len + s) * d_model + d; - output[idx] += effective * input_data[idx]; - } - } - } - } - } - - void update_act_state( - resizable_tensor& output, - const tensor& input_data, - const tensor& halt_probs, - resizable_tensor& cumulative_halting, - resizable_tensor& remainders, - resizable_tensor& n_steps, - resizable_tensor& effective_weights, - long batch_size, - long seq_len, - long d_model, - long 
num_channels, - float halt_threshold, - long current_step - ) - { - const long total_positions = batch_size * seq_len; - - launch_kernel(_cuda_update_act_state, - max_jobs(total_positions), - output.device(), - input_data.device(), - halt_probs.device(), - cumulative_halting.device(), - remainders.device(), - n_steps.device(), - effective_weights.device(), - batch_size, - seq_len, - d_model, - num_channels, - halt_threshold, - current_step); - } - - __global__ void _cuda_finalize_act_output( - float* output, - const float* input_data, - const float* remainders, - float* effective_weights, - size_t batch_size, - size_t seq_len, - size_t d_model, - size_t num_channels - ) - { - for (auto pos : grid_stride_range(0, batch_size * seq_len)) - { - float r = remainders[pos]; - if (r > 1e-6f) { - const size_t n = pos / seq_len; - const size_t s = pos % seq_len; - - effective_weights[pos] += r; - - for (size_t c = 0; c < num_channels; ++c) { - for (size_t d = 0; d < d_model; ++d) { - const size_t idx = ((n * num_channels + c) * seq_len + s) * d_model + d; - output[idx] += r * input_data[idx]; - } - } - } - } - } - - void finalize_act_output( - resizable_tensor& output, - const tensor& input_data, - const tensor& remainders, - resizable_tensor& effective_weights, - long batch_size, - long seq_len, - long d_model, - long num_channels - ) - { - const long total_positions = batch_size * seq_len; - - launch_kernel(_cuda_finalize_act_output, - max_jobs(total_positions), - output.device(), - input_data.device(), - remainders.device(), - effective_weights.device(), - batch_size, - seq_len, - d_model, - num_channels); - } - - __global__ void _cuda_apply_act_depth_scaling( - float* gradients, - const float* n_steps, - size_t batch_size, - size_t seq_len, - size_t d_model, - size_t num_channels, - float max_steps, - float scale_factor - ) - { - const long total_positions = batch_size * seq_len; - const long feature_dim = num_channels * d_model; - - for (auto pos : 
grid_stride_range_y(0, total_positions)) - { - const long n = pos / seq_len; - const long s = pos % seq_len; - const float scale = 1.0f + scale_factor * (n_steps[pos] / max_steps); - - for (auto feat_idx : grid_stride_range(0, feature_dim)) - { - const long c = feat_idx / d_model; - const long d = feat_idx % d_model; - const long idx = ((n * num_channels + c) * seq_len + s) * d_model + d; - gradients[idx] *= scale; - } - } - } - - void apply_act_depth_scaling( - tensor& gradients, - const tensor& n_steps, - long batch_size, - long seq_len, - long d_model, - long num_channels, - float max_steps, - float scale_factor - ) - { - const long total_positions = batch_size * seq_len; - const long feature_dim = num_channels * d_model; - - launch_kernel(_cuda_apply_act_depth_scaling, - max_jobs(feature_dim, total_positions), - gradients.device(), - n_steps.device(), - batch_size, - seq_len, - d_model, - num_channels, - max_steps, - scale_factor); - } - - // ---------------------------------------------------------------------------------------- - - - __device__ float cuda_log1pexp(float x) - { - if (x <= -18) - return std::exp(x); - else if (-18 < x && x <= 9) - return std::log1pf(std::exp(x)); - else if (9 < x && x <= 16) - return x + expf(-x); - else - return x; - } - - __global__ void _cuda_compute_loss_binary_log_per_pixel(float* loss_out, float* g, const float* truth, const float* out_data, size_t n, const float scale) - { - float loss = 0; - for(auto i : grid_stride_range(0, n)) - { - const float y = truth[i]; - - if (y > 0.f) - { - const float temp = cuda_log1pexp(-out_data[i]); - loss += y*temp; - g[i] = y*scale*(g[i]-1); - } - else if (y < 0.f) - { - const float temp = -(-out_data[i]-cuda_log1pexp(-out_data[i])); - loss += -y*temp; - g[i] = -y*scale*g[i]; - } - else - { - g[i] = 0.f; - } - } - - warp_reduce_atomic_add(*loss_out, loss); - } - - // ---------------------------------------------------------------------------------------- - - __device__ float 
cuda_safe_log(float x, float epsilon = 1e-10) - { - // Prevent trying to calculate the logarithm of a very small number (let alone zero) - if (x >= epsilon) - return ::log(x); - else - return ::log(epsilon); - } - - __global__ void _cuda_compute_loss_multiclass_log_per_pixel(float* loss_out, float* g, const uint16_t* truth, size_t n, size_t plane_size, size_t sample_size, size_t nk, uint16_t label_to_ignore, const float scale) - { - float loss = 0; - for(auto i : grid_stride_range(0, n)) - { - const size_t k = (i/plane_size)%nk; - const size_t idx = (i%plane_size) + plane_size*(i/sample_size); - - const size_t y = truth[idx]; - - if (k == y) - { - loss -= cuda_safe_log(g[i]); - g[i] = scale*(g[i] - 1); - } - else if (y == label_to_ignore) - { - g[i] = 0.f; - } - else - { - g[i] = scale*g[i]; - } - } - - warp_reduce_atomic_add(*loss_out, loss); - } - - __global__ void _cuda_compute_loss_multiclass_log_per_pixel_weighted(float* loss_out, float* g, const uint16_t* truth, size_t n, size_t plane_size, size_t sample_size, size_t nk, const float* weights, const float scale) - { - float loss = 0; - for(auto i : grid_stride_range(0, n)) - { - const size_t k = (i/plane_size)%nk; - const size_t idx = (i%plane_size) + plane_size*(i/sample_size); - - const size_t y = truth[idx]; - const float weight = weights[idx]; - - if (k == y) - { - loss -= weight*cuda_safe_log(g[i]); - g[i] = weight*scale*(g[i] - 1); - } - else - { - g[i] = weight*scale*g[i]; - } - } - - warp_reduce_atomic_add(*loss_out, loss); - } - // ---------------------------------------------------------------------------------------- - - __global__ void _cuda_compute_loss_mean_squared_per_channel_and_pixel(float* loss_out, float* g, const float* truth, const float* out_data, size_t n, const float scale) - { - float loss = 0; - for (auto i : grid_stride_range(0, n)) - { - const float y = truth[i]; - const float temp = y - out_data[i]; - loss += temp * temp; - g[i] = -temp * scale; - } - 
warp_reduce_atomic_add(*loss_out, loss); - } - - // ---------------------------------------------------------------------------------------- - - void compute_loss_binary_log_per_pixel:: - do_work( - cuda_data_ptr loss_work_buffer, - cuda_data_ptr truth_buffer, - const tensor& subnetwork_output, - tensor& gradient, - double& loss - ) - { - CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float))); - sigmoid(gradient, subnetwork_output); - - // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. - const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc()); - - launch_kernel(_cuda_compute_loss_binary_log_per_pixel, max_jobs(gradient.size()), - loss_work_buffer.data(), gradient.device(), truth_buffer.data(), subnetwork_output.device(), gradient.size(), scale); - - float floss; - dlib::cuda::memcpy(&floss, loss_work_buffer); - loss = scale*floss; - } - - void compute_loss_multiclass_log_per_pixel:: - do_work( - cuda_data_ptr loss_work_buffer, - cuda_data_ptr truth_buffer, - const tensor& subnetwork_output, - tensor& gradient, - double& loss - ) - { - CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float))); - softmax(gradient, subnetwork_output); - static const uint16_t label_to_ignore = std::numeric_limits::max(); - - // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. 
- const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc()); - - launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel, max_jobs(gradient.size()), - loss_work_buffer.data(), gradient.device(), truth_buffer.data(), gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), label_to_ignore, scale); - - float floss; - dlib::cuda::memcpy(&floss, loss_work_buffer); - loss = scale*floss; - } - - void compute_loss_multiclass_log_per_pixel_weighted:: - do_work( - cuda_data_ptr loss_work_buffer, - cuda_data_ptr truth_buffer, - cuda_data_ptr weights_buffer, - const tensor& subnetwork_output, - tensor& gradient, - double& loss - ) - { - CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float))); - softmax(gradient, subnetwork_output); - - // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. - const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc()); - - launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel_weighted, max_jobs(gradient.size()), - loss_work_buffer.data(), gradient.device(), truth_buffer.data(), gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), weights_buffer.data(), scale); - - float floss; - dlib::cuda::memcpy(&floss, loss_work_buffer); - loss = scale*floss; - } - - void compute_loss_mean_squared_per_channel_and_pixel:: - do_work( - cuda_data_ptr loss_work_buffer, - cuda_data_ptr truth_buffer, - const tensor& subnetwork_output, - tensor& gradient, - double& loss - ) - { - CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float))); - - // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. 
- const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.k() * subnetwork_output.nr() * subnetwork_output.nc()); - - launch_kernel(_cuda_compute_loss_mean_squared_per_channel_and_pixel , max_jobs(gradient.size()), - loss_work_buffer.data(), gradient.device(), truth_buffer.data(), subnetwork_output.device(), gradient.size(), scale); - - float floss; - dlib::cuda::memcpy(&floss, loss_work_buffer); - loss = scale*floss; - } - - // ---------------------------------------------------------------------------------------- - - } -} - +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. + +#include "cuda_utils.h" +#include "cuda_dlib.h" +#include "cudnn_dlibapi.h" +#include + + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + void set_device ( + int dev + ) + { + CHECK_CUDA(cudaSetDevice(dev)); + } + + int get_device ( + ) + { + int dev = 0; + CHECK_CUDA(cudaGetDevice(&dev)); + return dev; + } + + std::string get_device_name ( + int device + ) + { + cudaDeviceProp props; + CHECK_CUDA(cudaGetDeviceProperties(&props, device)); + return props.name; + } + + void set_current_device_blocking_sync( + ) + { + CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + } + + int get_num_devices ( + ) + { + int num_devices; + CHECK_CUDA(cudaGetDeviceCount(&num_devices)); + return num_devices; + } + + bool can_access_peer (int device_id, int peer_device_id) + { + int can_access; + CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, device_id, peer_device_id)); + return can_access != 0; + } + bool can_access_peer (const tensor& device, const tensor& peer_device) + { + return can_access_peer(device.device_id(), peer_device.device_id()); + } + + void device_synchronize (int dev) + { + raii_set_device set_dev(dev); + CHECK_CUDA(cudaDeviceSynchronize()); + } + void device_synchronize (const tensor& dev) { 
device_synchronize(dev.device_id()); } + + enable_peer_access:: + enable_peer_access( + int device_id, + int peer_device_id + ) : call_disable(false), device_id(device_id), peer_device_id(peer_device_id) + { + raii_set_device set_dev(device_id); + + auto err = cudaDeviceEnablePeerAccess(peer_device_id, 0); + if (err == cudaSuccess) + { + call_disable = true; + } + else if (err == cudaErrorPeerAccessAlreadyEnabled) + { + // call cudaGetLastError() to dispose of this error since we don't + // care. + auto err2 = cudaGetLastError(); + if (err2 != cudaErrorPeerAccessAlreadyEnabled) + CHECK_CUDA(err2); + } + else + { + CHECK_CUDA(err); + } + } + + + enable_peer_access:: + ~enable_peer_access() noexcept(false) + { + if (call_disable) + { + raii_set_device set_dev(device_id); + CHECK_CUDA(cudaDeviceDisablePeerAccess(peer_device_id)); + } + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_inverse_norms_accumulate( + float* invnorms, + const float* data, + size_t nr, + size_t nc + ) + { + for (auto i : grid_stride_range_y(0, nr)) + { + auto p = data + i * nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += p[j] * p[j]; + + warp_reduce_atomic_add(invnorms[i], temp); + } + } + + __global__ void _cuda_inverse_norms_invert( + float* invnorms, + size_t nr + ) + { + for (auto i : grid_stride_range_y(0, nr)) + { + if (threadIdx.x == 0) + invnorms[i] = 1.0f / std::sqrt(invnorms[i]); + } + } + + void inverse_norms( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ) + { + const auto nr = data.num_samples(); + const auto nc = data.size() / data.num_samples(); + + invnorms.set_size(nr); + invnorms = eps; + + launch_kernel(_cuda_inverse_norms_accumulate, max_jobs(nc, nr), + invnorms.device(), data.device(), nr, nc); + + launch_kernel(_cuda_inverse_norms_invert, max_jobs(1, nr), + invnorms.device(), nr); + } + + // 
---------------------------------------------------------------------------------------- + + __global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) + { + for (auto i : grid_stride_range_y(0, nr)) + { + auto l = lhs + i * nc; + auto r = rhs + i * nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += l[j] * r[j]; + + warp_reduce_atomic_add(out[i], temp); + } + } + + __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) + { + for (auto i : grid_stride_range_y(0, nr)) + { + auto l = lhs + i * nc; + auto r = rhs + i * nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += l[j] * r[j]; + + warp_reduce_atomic_add(out[i], temp); + } + } + + void dot_prods( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { + DLIB_CASSERT(have_same_dimensions(lhs, rhs)); + + out.set_size(lhs.num_samples()); + if (out.size() == 0) + return; + + const auto nr = lhs.num_samples(); + const auto nc = lhs.size() / lhs.num_samples(); + + out = 0; + launch_kernel(_cuda_dot_prods, max_jobs(nc, nr), + out.device(), lhs.device(), rhs.device(), nr, nc); + } + + void dot_prods( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { + DLIB_CASSERT(have_same_dimensions(lhs, rhs)); + DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1); + DLIB_CASSERT(out.num_samples() == lhs.num_samples()); + + const auto nr = lhs.num_samples(); + const auto nc = lhs.size() / lhs.num_samples(); + + if (add_to) + { + launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc, nr), + out.device(), lhs.device(), rhs.device(), nr, nc); + } + else + { + out = 0; + launch_kernel(_cuda_dot_prods, max_jobs(nc, nr), + out.device(), lhs.device(), rhs.device(), nr, nc); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_columns(float* out, const float* m, const 
float* v, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = m[j]*v[j%nc]; + } + } + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_rows(float* out, const float* m, const float* v, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = m[j]*v[j/nc]; + } + } + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + launch_kernel(_cuda_scale_rows, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_rows2(float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; + } + } + + __global__ void _cuda_scale_rows2_beta(const float beta, float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = beta*out[j] + (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; + } + } + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ) + { + if (beta == 0) + { + launch_kernel(_cuda_scale_rows2, max_jobs(m1.size()), out.device(), + m1.device(), m2.device(), v1.device(), v2.device(), m1.num_samples(), + m1.size()/m1.num_samples()); + } + else + { + launch_kernel(_cuda_scale_rows2_beta, max_jobs(m1.size()), beta, + out.device(), m1.device(), m2.device(), v1.device(), v2.device(), + m1.num_samples(), 
m1.size()/m1.num_samples()); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_exp(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::exp(src[i]); + } + + void exp ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_exp, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_log(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::log(src[i]); + } + + void log ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_log, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_log10(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::log10(src[i]); + } + + void log10 ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_log10, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]*s2[i]; + } + } + __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n, size_t max_size) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = 0; + for (size_t j = i; j < max_size; j += n) + d[i] += s1[j%s1_n]*s2[j%s2_n]; + } + } + + __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2, + size_t n, size_t 
s1_n, size_t s2_n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i%s1_n]*s2[i%s2_n]; + } + } + + __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i]*s2[i]; + } + } + __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n, size_t max_size) + { + for (auto i : grid_stride_range(0, n)) + { + for (size_t j = i; j < max_size; j += n) + d[i] += s1[j%s1_n]*s2[j%s2_n]; + } + } + + __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i%s1_n]*s2[i%s2_n]; + } + } + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + + DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && + dest.nr() == src1.nr() && src1.nr() == src2.nr() && + dest.nc() == src1.nc() && src1.nc() == src2.nc() ); + const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); + DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && + (src1.num_samples()==1 || src1.num_samples()==MD) && + (src2.num_samples()==1 || src2.num_samples()==MD) ); + + if (dest.size() == 0) + return; + + const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + if (dest.size() == src1.size() && src1.size() == src2.size()) + { + if (add_to) + launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); + else + launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); + } + else if (dest.num_samples() == 1) + { + if (add_to) + launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), 
src1.device(), src2.device(), + dest.size(), src1.size(), src2.size(), max_size); + else + launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size(), max_size); + } + else + { + if (add_to) + launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size()); + else + launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size()); + } + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_multiply_conv(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i / bs) % ks; + d[i] = s1[i] * s2[k]; + } + } + + __global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + // loop over all the image planes + for (auto i : grid_stride_range_y(0, n)) + { + // sum all the elements in the i-th image plane + float temp = 0; + for (auto j : grid_stride_range(i* bs, (i + 1)* bs)) + temp += s1[j] * s2[j]; + auto k = i % ks; + // and store the sum into d[k] + warp_reduce_atomic_add(d[k], temp); + } + } + + __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i / bs) % ks; + d[i] += s1[i] * s2[k]; + } + } + + __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + // loop over all the image planes + for (auto i : grid_stride_range_y(0, n)) + { + // sum all the elements in the i-th image plane + float temp = 0; + for (auto j : grid_stride_range(i* bs, (i + 1)* bs)) + temp += s1[j] * s2[j]; + auto k = i % ks; + // and store the sum into d[k] + 
warp_reduce_atomic_add(d[k], temp); + } + } + + void multiply_conv( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (have_same_dimensions(dest, src1)) + { + DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k()); + if (add_to) + { + launch_kernel(_cuda_multiply_conv_add_to, max_jobs(dest.size()), + dest.device(), src1.device(), src1.size(), src2.device(), src1.nr() * src1.nc(), src1.k()); + } + else + { + launch_kernel(_cuda_multiply_conv, max_jobs(dest.size()), + dest.device(), src1.device(), src1.size(), src2.device(), src1.nr() * src1.nc(), src1.k()); + } + } + else + { + DLIB_CASSERT(src1.num_samples() == src2.num_samples() && src1.k() == src2.k() && + src1.nr() == src2.nr() && src1.nc() == src2.nc()); + DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k()); + + const auto bs = src1.nr() * src1.nc(); + const auto n = src1.num_samples() * src1.k(); + + if (add_to) + { + launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs, n), + dest.device(), src1.device(), n, src2.device(), bs, src1.k()); + } + else + { + dest = 0; + launch_kernel(_cuda_multiply_conv2, max_jobs(bs, n), + dest.device(), src1.device(), n, src2.device(), bs, src1.k()); + } + } + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_scale_channels_add_to(float* d, const float* src, size_t n, const float* scales, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = i/bs; + d[i] += src[i]*scales[k]; + } + } + + __global__ void _cuda_scale_channels(float* d, const float* src, size_t n, const float* scales, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = i/bs; + d[i] = src[i]*scales[k]; + } + } + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src) && + 
scales.num_samples() == src.num_samples() && + scales.k() == src.k() && + scales.nr() == 1 && + scales.nc() == 1 ); + + if (dest.size() == 0) + return; + + if (add_to) + launch_kernel(_cuda_scale_channels_add_to,max_jobs(dest.size()), + dest.device(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); + else + launch_kernel(_cuda_scale_channels,max_jobs(dest.size()), + dest.device_write_only(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]*s2[i]; + } + } + + __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i]*s2[i]; + } + } + + __global__ void _cuda_mult2(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] = v1*v2; + } + } + + __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + 
} + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] += v1*v2; + } + } + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (dest.size() == 0) + return; + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + if (add_to) + launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + else + launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + } + else + { + if (add_to) + { + // Otherwise, do the more complex version with bounds checking. + launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + else + { + // Otherwise, do the more complex version with bounds checking. 
+ launch_kernel(_cuda_mult2,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + } + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]+s2[i]; + } + } + + __global__ void _cuda_add2(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] = v1+v2; + } + } + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (dest.size() == 0) + return; + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + launch_kernel(_cuda_add1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + } + else + { + // Otherwise, do the more complex version with bounds checking. 
+ launch_kernel(_cuda_add2,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_affine_transform1(float* d, const float* s, size_t n, float A, float B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s[i] + B; + } + } + + __global__ void _cuda_affine_transform1_0(float* d, const float* s, size_t n, float A) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s[i]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src.size()); + if (B != 0) + launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B); + else + launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A + ) + { + DLIB_CASSERT(dest.size()==src.size()); + launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_rect( + float* d, + const float* s1, + const float* s2, + const float* s3, + float A, + float B, + float C, + size_t start_idx, + size_t n, + size_t rect_nc, + size_t total_nc + ) + { + for (auto i : grid_stride_range(0, n)) + { + size_t r = i/rect_nc; + size_t c = i%rect_nc; + size_t idx = r*total_nc + c + start_idx; + d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx]; + } + } + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + 
float B, + float C + ) + { + DLIB_CASSERT(dest.size() == src1.size()); + DLIB_CASSERT(dest.size() == src2.size()); + DLIB_CASSERT(dest.size() == src3.size()); + DLIB_CASSERT(dest.num_samples() == src1.num_samples()); + DLIB_CASSERT(dest.num_samples() == src2.num_samples()); + DLIB_CASSERT(dest.num_samples() == src3.num_samples()); + DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect)); + launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()), + dest.device(), src1.device(), src2.device(), src3.device(), A, B, C, + rect.left() + rect.top()*(dest.size()/dest.num_samples()), + rect.area(), + rect.width(), + dest.size()/dest.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i] + C; + } + } + + __global__ void _cuda_affine_transform4_0(float* d, const float* s1, const float* s2, size_t n, float A, float B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + if (C != 0) + launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C); + else + launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + 
launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_add_scaled(float* d, const float* s, size_t n, float scale) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += scale*s[i]; + } + } + + void add_scaled( + tensor& dest, + const float scale, + const tensor& src + ) + { + DLIB_CASSERT(dest.size()==src.size()); + launch_kernel(_cuda_add_scaled,max_jobs(dest.size()),dest.device(), src.device(), dest.size(), scale); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_add_cv_to_all_columns(float beta, float* dest, float alpha, const float* src, size_t size, size_t stride) + { + for (auto i : grid_stride_range(0, size)) + { + dest[i] = beta*dest[i] + alpha*src[i/stride]; + } + } + + __global__ void _cuda_add_cv_to_all_columns_no_beta(float* dest, float alpha, const float* src, size_t size, size_t stride) + { + for (auto i : grid_stride_range(0, size)) + { + dest[i] = alpha*src[i/stride]; + } + } + + void add_cv_to_all_columns( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { + DLIB_CASSERT(dest.num_samples() == src.num_samples() && src.num_samples() == src.size()); + if (beta == 0) + launch_kernel(_cuda_add_cv_to_all_columns_no_beta, max_jobs(dest.size()), dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); + else + launch_kernel(_cuda_add_cv_to_all_columns, max_jobs(dest.size()), beta, dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform5( + float* d, const float* s1, const float* s2, const float* s3, size_t n, float A, float B, float C, float D + ) + { + for (auto i : 
grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D; + } + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + launch_kernel(_cuda_affine_transform5,max_jobs(dest.size()),dest.device(), src1.device(), + src2.device(), src3.device(), dest.size(), A, B, C, D); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_range( + float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C + ) + { + for (auto i : grid_stride_range(begin, end)) + { + d[i] = A*s1[i] + B*s2[i] + C*s3[i]; + } + } + + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + DLIB_CASSERT(begin <= end && end <= dest.size()); + launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin), + dest.device(), src1.device(), + src2.device(), src3.device(), begin, end, A, B, C); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A[i]*s[i] + B[i]; + } + } + __global__ void _cuda_affine_transform3(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A[i%bs]*s[i] + B[i%bs]; + } + } + + void affine_transform( + tensor& dest, + const 
tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT( + ((A.num_samples()==1 && B.num_samples()==1) || + (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples()))); + DLIB_CASSERT( + A.nr()==B.nr() && B.nr()==src.nr() && + A.nc()==B.nc() && B.nc()==src.nc() && + A.k() ==B.k() && B.k()==src.k(), + "\nA.nr(): " << A.nr() << "\nB.nr(): " << B.nr() << "\nsrc.nr(): " << src.nr() + <<"\nA.nc(): " << A.nc() << "\nB.nc(): " << B.nc() << "\nsrc.nc(): " << src.nc() + <<"\nA.k(): " << A.k() << "\nB.k(): " << B.k() << "\nsrc.k(): " << src.k() + ); + + if (A.num_samples() == 1) + { + launch_kernel(_cuda_affine_transform3,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device(), A.size()); + } + else + { + launch_kernel(_cuda_affine_transform2,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device()); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_compute_adam_update( + size_t begin, + size_t end, + float* s, + float* m, + float* v, + const float alpha, + const float weight_decay, + const float momentum1, + const float momentum2, + const float* params, + const float* params_grad + ) + { + const float eps = 1e-8; + // The loop is equivalent to doing this: + // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad); + // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad); + // s = -alpha*m/(sqrt(v) + eps); + for (auto i : grid_stride_range(begin, end)) + { + float g = (weight_decay*params[i] + params_grad[i]); + m[i] = momentum1*m[i] + (1-momentum1)*g; + v[i] = momentum2*v[i] + (1-momentum2)*g*g; + s[i] = -alpha*m[i]/(std::sqrt(v[i]) + eps); + } + } + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float 
weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ) + { + DLIB_CASSERT(s.size() == m.size() && + s.size() == v.size() && + s.size() == params.size() && + s.size() == params_grad.size()); + DLIB_CASSERT(begin <= end && end <= params.size()); + const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t)); + + launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin), + begin, end, s.device(), m.device(), v.device(), alpha, weight_decay, + momentum1, momentum2, params.device(), params_grad.device()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_conv(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i/bs)%ks; + d[i] = A[k]*s[i] + B[k]; + } + } + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT(have_same_dimensions(A, B)); + DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 && A.nc() == 1 && A.k() == src.k()); + + launch_kernel(_cuda_affine_transform_conv,max_jobs(dest.size()), + dest.device(), src.device(), src.size(), A.device(), B.device(), src.nr()*src.nc(), src.k()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n) + { + for (auto i : grid_stride_range(0, n)) + { + out[i] = in[i]; + for (size_t j = i+n; j < total_n; j+=n) + out[i] += in[j]; + } + } + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + gradient_input.k() == grad.k() && + gradient_input.nr() == grad.nr() && + gradient_input.nc() == grad.nc() && + gradient_input.size() 
> 0); + + launch_kernel(_add_bias_gradient,max_jobs(grad.size()),grad.device(), gradient_input.device(), grad.size(), gradient_input.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _set_tensor(float* out, size_t n, const float val) + { + for (auto i : grid_stride_range(0, n)) + out[i] = val; + } + + void set_tensor ( + tensor& t, + float value + ) + { + launch_kernel(_set_tensor, max_jobs(t.size()), t.device(), t.size(), value); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _scale_tensor(float* out, size_t n, const float val) + { + for (auto i : grid_stride_range(0, n)) + out[i] *= val; + } + + void scale_tensor ( + tensor& t, + float value + ) + { + launch_kernel(_scale_tensor, max_jobs(t.size()), t.device(), t.size(), value); + } + + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_threshold(float* d, size_t n, float thresh) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = d[i]>thresh ? 1:0; + } + } + + void threshold ( + tensor& data, + float thresh + ) + { + launch_kernel(_cuda_threshold,max_jobs(data.size()),data.device(), data.size(), thresh); + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_dot(const float* a, const float* b, size_t n, float* result) + { + // Parallel sum everything into local temp variables. + float temp = 0; + for(auto i : grid_stride_range(0, n)) + temp += a[i]*b[i]; + + // Then do the warp reduce add thing to merge into one output value. 
+ warp_reduce_atomic_add(*result, temp); + } + + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ) + { + DLIB_CASSERT(a.size() == b.size()); + DLIB_CASSERT(idx < result.size()); + + launch_kernel(_cuda_dot, max_jobs(a.size()), a.device(), b.device(), a.size(), result.device()+idx); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_prelu(const float* s, float* d, size_t n, const float* pp) + { + const float p = *pp; + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + d[i] = s[i]; + else + d[i] = p*s[i]; + } + } + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ) + { + launch_kernel(_cuda_prelu, max_jobs(dest.size()), + src.device(), dest.device(), src.size(), param.device()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_prelu_gradient(float* out, const float* s, const float* gi, size_t n, const float* pp, float* ppgrad) + { + const float p = *pp; + float pgrad = 0; + for(auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + { + out[i] += gi[i]; + } + else + { + out[i] += p*gi[i]; + pgrad += gi[i]*s[i]; + } + } + + // Then do the warp reduce add thing to merge into one output value. 
+ warp_reduce_atomic_add(*ppgrad, pgrad); + } + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ) + { + params_grad = 0; + launch_kernel(_cuda_prelu_gradient, max_jobs(grad.size()), + grad.device(), src.device(), gradient_input.device(), grad.size(), + param.device(), params_grad.device()); + } + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_leaky_relu(const float* s, float* d, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + d[i] = s[i]; + else + d[i] = alpha * s[i]; + } + } + + void leaky_relu( + tensor& dest, + const tensor& src, + const float alpha + ) + { + launch_kernel(_cuda_leaky_relu, max_jobs(dest.size()), + src.device(), dest.device(), src.size(), alpha); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_leaky_relu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + out[i] = gi[i]; + else + out[i] = alpha * gi[i]; + } + } + + __global__ void _cuda_leaky_relu_gradient(float* out, const float* s, const float* gi, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + out[i] += gi[i]; + else + out[i] += alpha * gi[i]; + } + } + + void leaky_relu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const float alpha + ) + { + float* out = grad.device(); + const float* gi = gradient_input.device(); + if (out == gi) + { + launch_kernel(_cuda_leaky_relu_gradient_inplace, max_jobs(grad.size()), + out, src.device(), gi, grad.size(), alpha); + } + else + { + launch_kernel(_cuda_leaky_relu_gradient, max_jobs(grad.size()), + out, src.device(), gi, grad.size(), alpha); + } + } + + // 
---------------------------------------------------------------------------------------- + + __global__ void _cuda_mish(const float* s, float* d, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + const auto e = std::exp(s[i]); + const auto delta = 2*e + e*e + 2; + d[i] = s[i] - 2*s[i]/delta; + } + } + + void mish ( + tensor& dest, + const tensor& src + ) + { + launch_kernel(_cuda_mish, max_jobs(dest.size()), src.device(), dest.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __device__ float mish_compute_gradient(float x) + { + if (x >= 8) + return 1.f; + if (x <= -8) + return 0.f; + + const auto e = std::exp(x); + const auto delta = 2*e + e*e + 2; + const auto omega = 4*(x + 1) + 4*e*e + e*e*e + e*(4*x + 6); + return e*omega/(delta*delta); + } + + __global__ void _cuda_mish_gradient_inplace(float* out, const float* s, const float* gi, size_t n) + { + for (auto i : grid_stride_range(0, n)) + out[i] = gi[i]*mish_compute_gradient(s[i]); + } + + __global__ void _cuda_mish_gradient(float* out, const float* s, const float* gi, size_t n) + { + for (auto i : grid_stride_range(0, n)) + out[i] += gi[i]*mish_compute_gradient(s[i]); + } + + void mish_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input + ) + { + float* out = grad.device(); + const float* gi = gradient_input.device(); + if (out == gi) + launch_kernel(_cuda_mish_gradient_inplace, max_jobs(grad.size()), out, src.device(), gi, grad.size()); + else + launch_kernel(_cuda_mish_gradient, max_jobs(grad.size()), out, src.device(), gi, grad.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_clipped_relu(const float* s, float* d, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] < 0) + d[i] = 0; + else if (s[i] > alpha) + d[i] = alpha; + else + d[i] = s[i]; + } + } + + void clipped_relu 
( + tensor& dest, + const tensor &src, + const float alpha + ) + { + launch_kernel(_cuda_clipped_relu, max_jobs(dest.size()), + src.device(), dest.device(), src.size(), alpha); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_clipped_relu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0 && s[i] < alpha) + out[i] = gi[i]; + else + out[i] = 0.f; + } + } + + __global__ void _cuda_clipped_relu_gradient(float* out, const float* s, const float* gi, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0 && s[i] < alpha) + out[i] += gi[i]; + } + } + + void clipped_relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input, + const float alpha + ) + { + float* out = grad.device(); + const float* gi = gradient_input.device(); + if (out == gi) + launch_kernel(_cuda_clipped_relu_gradient_inplace, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); + else + launch_kernel(_cuda_clipped_relu_gradient, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_elu(const float* s, float* d, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + d[i] = s[i]; + else + d[i] = alpha * (std::exp(s[i]) - 1.0f); + } + } + + void elu ( + tensor& dest, + const tensor &src, + const float alpha + ) + { + launch_kernel(_cuda_elu, max_jobs(dest.size()), src.device(), dest.device(), src.size(), alpha); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_elu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + 
{ + if (s[i] > 0) + out[i] = gi[i]; + else + out[i] = (alpha + s[i]) * gi[i]; + } + } + + __global__ void _cuda_elu_gradient(float* out, const float* s, const float* gi, size_t n, const float alpha) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + out[i] += gi[i]; + else + out[i] += (alpha + s[i]) * gi[i]; + } + } + + void elu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input, + const float alpha + ) + { + float* out = grad.device(); + const float* gi = gradient_input.device(); + if (out == gi) + launch_kernel(_cuda_elu_gradient_inplace, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); + else + launch_kernel(_cuda_elu_gradient, max_jobs(grad.size()), out, dest.device(), gi, grad.size(), alpha); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_gelu(const float* s, float* d, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s[i] * normcdf(s[i]); + } + } + + void gelu ( + tensor& dest, + const tensor& src + ) + { + launch_kernel(_cuda_gelu, max_jobs(dest.size()), src.device(), dest.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __device__ float gelu_compute_gradient(float x) + { + const float beta = 1.0f / CUDART_SQRT_2PI; + const float cdf = normcdf(x); + const float pdf = beta*std::exp(-0.5f*x*x); + return cdf + x * pdf; + } + + __global__ void _cuda_gelu_gradient_inplace(float* out, const float* s, const float* gi, size_t n) + { + for (auto i : grid_stride_range(0, n)) + out[i] = gi[i]*gelu_compute_gradient(s[i]); + } + + __global__ void _cuda_gelu_gradient(float* out, const float* s, const float* gi, size_t n) + { + for (auto i : grid_stride_range(0, n)) + out[i] += gi[i]*gelu_compute_gradient(s[i]); + } + + void gelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input + ) + { + float* out = 
grad.device(); + const float* gi = gradient_input.device(); + if (out == gi) + launch_kernel(_cuda_gelu_gradient_inplace, max_jobs(grad.size()), out, src.device(), gi, grad.size()); + else + launch_kernel(_cuda_gelu_gradient, max_jobs(grad.size()), out, src.device(), gi, grad.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_smelu (const float* s, float* d, size_t n, const float beta) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] >= beta) + d[i] = s[i]; + else if (s[i] <= -beta) + d[i] = 0; + else + d[i] = (s[i] + beta) * (s[i] + beta) / (4 * beta); + } + } + + void smelu ( + tensor& dest, + const tensor& src, + const float beta + ) + { + launch_kernel(_cuda_smelu, max_jobs(dest.size()), src.device(), dest.device(), src.size(), beta); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_smelu_gradient_inplace(float* out, const float* s, const float* gi, size_t n, const float beta) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] >= beta) + out[i] = gi[i]; + else if (s[i] == 0) + out[i] = 0; + else + out[i] = std::sqrt(beta * s[i]) / beta * gi[i]; + } + } + + __global__ void _cuda_smelu_gradient(float* out, const float* s, const float* gi, size_t n, const float beta) + { + for (auto i : grid_stride_range(0, n)) + { + if (s[i] >= beta) + out[i] += gi[i]; + else if (s[i] == 0) + continue; + else + out[i] += std::sqrt(beta * s[i]) / beta * gi[i]; + } + } + + void smelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const float beta + ) + { + float* out = grad.device(); + const float* gi = gradient_input.device(); + if (out == gi) + { + launch_kernel(_cuda_smelu_gradient_inplace, max_jobs(grad.size()), + out, src.device(), gi, grad.size(), beta); + } + else + { + launch_kernel(_cuda_smelu_gradient, max_jobs(grad.size()), + out, src.device(), gi, 
grad.size(), beta); + } + } + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_silu(const float* s, float* d, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s[i] / (1.0f + std::exp(-s[i])); + } + } + + void silu ( + tensor& dest, + const tensor& src + ) + { + launch_kernel(_cuda_silu, max_jobs(dest.size()), src.device(), dest.device(), src.size()); + } + + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_silu_gradient_inplace(float* out, const float* s, const float* gi, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + const auto sig_s = 1.0f / (1.0f + std::exp(-s[i])); + out[i] = gi[i] * (sig_s * (1.0f + s[i] * (1.0f - sig_s))); + } + } + + __global__ void _cuda_silu_gradient(float* out, const float* s, const float* gi, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + const auto sig_s = 1.0f / (1.0f + std::exp(-s[i])); + out[i] += gi[i] * (sig_s * (1.0f + s[i] * (1.0f - sig_s))); + } + } + + void silu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input + ) + { + float* out = grad.device(); + const float* gi = gradient_input.device(); + if (out == gi) + launch_kernel(_cuda_silu_gradient_inplace, max_jobs(grad.size()), out, src.device(), gi, grad.size()); + else + launch_kernel(_cuda_silu_gradient, max_jobs(grad.size()), out, src.device(), gi, grad.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_resize_bilinear(size_t dsize, size_t dchan_size, size_t dnc, float* d, + size_t schan_size, int snr, int snc, const float* s, + const float x_scale, const float y_scale) + { + for(auto i : grid_stride_range(0, dsize)) + { + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + 
const float y = r*y_scale; + const int top = static_cast(::floorf(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast(::floorf(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + float tl = s[sidx+top*snc+left]; + float tr = s[sidx+top*snc+right]; + float bl = s[sidx+bottom*snc+left]; + float br = s[sidx+bottom*snc+right]; + + float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + + tb_frac*((1-lr_frac)*bl + lr_frac*br); + + d[i] = temp; + } + } + + __global__ void _cuda_resize_bilinear_strided(size_t dsize, size_t dchan_size, size_t dnc, float* d, + size_t schan_size, int snr, int snc, const float* s, + const float x_scale, const float y_scale, + size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided + ) + { + for(auto i : grid_stride_range(0, dsize)) + { + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + const int didx = channel*dest_chan_size_strided + r*dest_row_stride+c; + + const float y = r*y_scale; + const int top = static_cast(::floorf(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast(::floorf(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + float tl = s[sidx+top*src_row_stride+left]; + float tr = s[sidx+top*src_row_stride+right]; + float bl = s[sidx+bottom*src_row_stride+left]; + float br = s[sidx+bottom*src_row_stride+right]; + + float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + + tb_frac*((1-lr_frac)*bl + lr_frac*br); + + d[didx] = temp; + } + } + + void resize_bilinear ( + tensor& dest, + long long dest_row_stride, + long long dest_channel_stride, + const tensor& src, + long long src_row_stride, + long long src_channel_stride + ) + { + DLIB_CASSERT(is_same_object(dest, 
src)==false); + DLIB_CASSERT(dest.num_samples() == src.num_samples()); + DLIB_CASSERT(dest.k() == src.k()); + + if (dest.size() == 0 || src.size() == 0) + return; + + const float x_scale = (src.nc()-1)/(float)std::max((dest.nc()-1),1); + const float y_scale = (src.nr()-1)/(float)std::max((dest.nr()-1),1); + + if (dest.nc() == dest_row_stride && dest.nr()*dest.nc()==dest_channel_stride && + src.nc() == src_row_stride && src.nr()*src.nc()==src_channel_stride) + { + launch_kernel(_cuda_resize_bilinear, + dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), + src.nr()*src.nc(), src.nr(), src.nc(), src.device(), + x_scale, y_scale); + } + else + { + launch_kernel(_cuda_resize_bilinear_strided, + dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), + src_channel_stride, src.nr(), src.nc(), src.device(), + x_scale, y_scale, dest_row_stride, src_row_stride, dest_channel_stride); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_resize_bilinear_gradient(size_t dsize, size_t dchan_size, size_t dnc, const float* d, + size_t schan_size, int snr, int snc, float* s, + const float x_scale, const float y_scale) + { + for(auto i : grid_stride_range(0, dsize)) + { + const float tmp = d[i]; + + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float y = r*y_scale; + const int top = static_cast(::floorf(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast(::floorf(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + + atomicAdd(s+sidx+top*snc+left, tmp*(1-tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+top*snc+right, tmp*(1-tb_frac)*(lr_frac)); + atomicAdd(s+sidx+bottom*snc+left, tmp*(tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+bottom*snc+right, 
tmp*(tb_frac)*(lr_frac)); + } + } + + __global__ void _cuda_resize_bilinear_gradient_strided(size_t dsize, size_t dchan_size, size_t dnc, const float* d, + size_t schan_size, int snr, int snc, float* s, + const float x_scale, const float y_scale, + size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided + ) + { + for(auto i : grid_stride_range(0, dsize)) + { + + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int didx = channel*dest_chan_size_strided; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float tmp = d[didx + r*dest_row_stride+c]; + + const float y = r*y_scale; + const int top = static_cast(::floorf(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast(::floorf(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + + atomicAdd(s+sidx+top*src_row_stride+left, tmp*(1-tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+top*src_row_stride+right, tmp*(1-tb_frac)*(lr_frac)); + atomicAdd(s+sidx+bottom*src_row_stride+left, tmp*(tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+bottom*src_row_stride+right, tmp*(tb_frac)*(lr_frac)); + } + } + + void resize_bilinear_gradient ( + tensor& grad, + long long grad_row_stride, + long long grad_channel_stride, + const tensor& gradient_input, + long long gradient_input_row_stride, + long long gradient_input_channel_stride + ) + { + DLIB_CASSERT(is_same_object(grad, gradient_input)==false); + DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples()); + DLIB_CASSERT(gradient_input.k() == grad.k()); + + if (grad.size() == 0 || gradient_input.size() == 0) + return; + + const float x_scale = (grad.nc()-1)/(float)std::max((gradient_input.nc()-1),1); + const float y_scale = (grad.nr()-1)/(float)std::max((gradient_input.nr()-1),1); + + if (grad.nc() == grad_row_stride && grad.nr()*grad.nc()==grad_channel_stride && + 
gradient_input.nc() == gradient_input_row_stride && gradient_input.nr()*gradient_input.nc()==gradient_input_channel_stride) + { + launch_kernel(_cuda_resize_bilinear_gradient, + gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), + grad.nr()*grad.nc(), grad.nr(), grad.nc(), grad.device(), + x_scale, y_scale); + } + else + { + launch_kernel(_cuda_resize_bilinear_gradient_strided, + gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), + grad_channel_stride, grad.nr(), grad.nc(), grad.device(), + x_scale, y_scale, gradient_input_row_stride, grad_row_stride, gradient_input_channel_stride); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_reorg(size_t dsize, size_t dk, size_t dnr, size_t dnc, float* d, + size_t sk, size_t snr, int snc, const float* s, + const size_t row_stride, const size_t col_stride, const bool add_to) + { + const auto out_plane_size = dnr * dnc; + const auto out_sample_size = dk * out_plane_size; + for (auto i : grid_stride_range(0, dsize)) + { + const auto n = i / out_sample_size; + const auto out_idx = i % out_sample_size; + const auto out_k = out_idx / out_plane_size; + const auto out_rc = out_idx % out_plane_size; + const auto out_r = out_rc / dnc; + const auto out_c = out_rc % dnc; + + const auto in_k = out_k % sk; + const auto in_r = out_r * row_stride + (out_k / sk) / col_stride; + const auto in_c = out_c * col_stride + (out_k / sk) % col_stride; + + const auto in_idx = ((n * sk + in_k) * snr + in_r) * snc + in_c; + if (add_to) d[i] += s[in_idx]; + else d[i] = s[in_idx]; + } + } + + __global__ void _cuda_reorg_gradient(size_t ssize, size_t dk, size_t dnr, size_t dnc, float* d, + size_t sk, size_t snr, int snc, const float* s, const size_t row_stride, + const size_t col_stride, const bool add_to + ) + { + for(auto i : grid_stride_range(0, ssize)) + { + 
const auto n = i / (sk * snr * snc); + const auto sample_idx = i % (sk * snr * snc); + const auto in_k = (sample_idx / (snr * snc)) % sk; + const auto in_r = (sample_idx / snc) % snr; + const auto in_c = sample_idx % snc; + + const auto out_k = in_k % dk; + const auto out_r = in_r * row_stride + (in_k / dk) / col_stride; + const auto out_c = in_c * col_stride + (in_k / dk) % col_stride; + const auto out_idx = ((n * dk + out_k) * dnr + out_r) * dnc + out_c; + + if (add_to) d[out_idx] += s[i]; + else d[out_idx] = s[i]; + } + } + + void reorg( + bool add_to, + tensor& dest, + const int row_stride, + const int col_stride, + const tensor& src + ) + { + DLIB_CASSERT(!is_same_object(dest, src), "Destination and source must be distinct objects."); + DLIB_CASSERT(src.nr() % row_stride == 0, "The number of rows in src must be divisible by row_stride."); + DLIB_CASSERT(src.nc() % col_stride == 0, "The number of columns in src must be divisible by col_stride."); + DLIB_CASSERT(dest.num_samples() == src.num_samples(), "The number of samples must match."); + DLIB_CASSERT(dest.k() == src.k() * row_stride * col_stride, "The number of channels must match."); + DLIB_CASSERT(dest.nr() == src.nr() / row_stride, "The number of rows must match."); + DLIB_CASSERT(dest.nc() == src.nc() / col_stride, "The number of columns must match."); + + launch_kernel(_cuda_reorg, dest.size(), dest.k(), dest.nr(), dest.nc(), dest.device(), + src.k(), src.nr(), src.nc(), src.device(), row_stride, col_stride, add_to); + } + + void reorg_gradient( + bool add_to, + tensor& grad, + const int row_stride, + const int col_stride, + const tensor& gradient_input + ) + { + DLIB_CASSERT(!is_same_object(grad, gradient_input), "Grad and gradient_input must be distinct objects."); + DLIB_CASSERT(grad.nr() % row_stride == 0, "The number of rows in grad must be divisible by row_stride."); + DLIB_CASSERT(grad.nc() % col_stride == 0, "The number of columns in grad must be divisible by col_stride."); + 
DLIB_CASSERT(grad.num_samples() == gradient_input.num_samples(), "The number of samples in grad and gradient_input must match."); + DLIB_CASSERT(grad.k() == gradient_input.k() / row_stride / col_stride, "The number of channels in grad must be gradient_input.k() divided by row_stride and col_stride."); + DLIB_CASSERT(grad.nr() == gradient_input.nr() * row_stride, "The number of rows in grad must be gradient_input.nr() multiplied by row_stride."); + DLIB_CASSERT(grad.nc() == gradient_input.nc() * col_stride, "The number of columns in grad must be gradient_input.nc() multiplied by col_stride."); + + launch_kernel(_cuda_reorg_gradient, gradient_input.size(), grad.k(), grad.nr(), grad.nc(), grad.device(), + gradient_input.k(), gradient_input.nr(), gradient_input.nc(), gradient_input.device(), + row_stride, col_stride, add_to); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_embeddings(size_t dsize, size_t dk, size_t dr, size_t dc, + float* d, const float* s, const float* e, size_t es + ) + { + for (auto i : grid_stride_range(0, dsize)) + { + const auto n = i / (dk * dr * dc); + const auto s_idx = i % (dk * dr * dc); + const auto k = (s_idx / (dr * dc)) % dk; + const auto r = (s_idx / dc) % dr; + const auto c = s_idx % dc; + + const unsigned long t_idx = static_cast(s[(n * dk + k) * dr + r]); + + if (t_idx < es) + d[i] = e[t_idx * dc + c]; + else + d[i] = 0.0f; + } + } + + void embeddings( + resizable_tensor& dest, + const tensor& src, + const tensor& embs + ) + { + DLIB_CASSERT( + src.nr() > 0 && + embs.num_samples() > 0 && + embs.k() > 0 && + embs.nr() == 1 && + embs.nc() == 1, + "\nsrc.num_samples(): " << src.num_samples() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\nembs.num_samples(): " << embs.num_samples() << + "\nembs.k(): " << embs.k() << + "\nembs.nr(): " << embs.nr() << + "\nembs.nc(): " << embs.nc() + ); + + const long dk = 
dest.k(); + const long dr = dest.nr(); + const long dc = dest.nc(); + + launch_kernel(_cuda_embeddings, dest.size(), dk, dr, dc, + dest.device(), src.device(), embs.device(), embs.num_samples()); + } + + __global__ void _cuda_embeddings_gradient(size_t ssize, size_t sk, size_t sr, size_t sc, + const float* o, const float* gi, float* g, const float* f, float lr, bool sl, size_t es + ) + { + for (auto i : grid_stride_range(0, ssize)) + { + const auto n = i / (sk * sr * sc); + const auto s_idx = i % (sk * sr * sc); + const auto k = (s_idx / (sr * sc)) % sk; + const auto r = (s_idx / sc) % sr; + const auto c = s_idx % sc; + + const unsigned long t_idx = static_cast(o[(n * sk + k) * sr + r]); + if (t_idx < es) + { + const float f_t = f[t_idx]; + float f_s = 1.0f; + + if (sl && f_t != 0.0f) f_s = fminf(0.15f, fmaxf(1.0f / f_t, 1.0f)); + if (f_t > 1) atomicAdd(&g[t_idx * sc + c], -gi[i] * lr * f_s); + else g[t_idx * sc + c] -= gi[i] * lr * f_s; + } + } + } + + void embeddings_gradient( + const tensor& prev, + const tensor& gradient_input, + tensor& grads, + const tensor& freqs, + float learning_rate, + bool scale + ) + { + DLIB_CASSERT( + prev.nr() > 0 && + gradient_input.num_samples() == prev.num_samples() && + gradient_input.k() == prev.k() && + gradient_input.nr() == prev.nr() && + gradient_input.nc() == grads.k() && + grads.num_samples() > 0 && + grads.k() > 0 && + grads.nr() == 1 && + grads.nc() == 1, + "\ngradient_input.num_samples(): " << gradient_input.num_samples() << + "\ngradient_input.k(): " << gradient_input.k() << + "\ngradient_input.nr(): " << gradient_input.nr() << + "\ngradient_input.nc(): " << gradient_input.nc() << + "\nprev.num_samples(): " << prev.num_samples() << + "\nprev.k(): " << prev.k() << + "\nprev.nr(): " << prev.nr() << + "\nprev.nc(): " << prev.nc() << + "\ngrads.num_samples(): " << grads.num_samples() << + "\ngrads.k(): " << grads.k() << + "\ngrads.nr(): " << grads.nr() << + "\ngrads.nc(): " << grads.nc() + ); + + const long sk = 
gradient_input.k(); + const long sr = gradient_input.nr(); + const long sc = gradient_input.nc(); + + launch_kernel(_cuda_embeddings_gradient, gradient_input.size(), sk, sr, sc, + prev.device(), gradient_input.device(), grads.device(), freqs.device(), + learning_rate, scale, grads.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_layer_normalize_accumulate( + float* m, + float* v, + const float* s, + size_t ns, + size_t k, + size_t num + ) + { + for (auto n : grid_stride_range_y(0, ns)) + { + const auto ps = s + n * k * num; + float means = 0; + float invstds = 0; + for (auto i : grid_stride_range(0, k* num)) + { + means += ps[i]; + invstds += ps[i] * ps[i]; + } + warp_reduce_atomic_add(m[n], means / (k * num)); + warp_reduce_atomic_add(v[n], invstds / (k * num)); + } + } + + __global__ void _cuda_layer_normalize_invert( + float* m, + float* v, + float eps, + size_t ns + ) + { + for (auto n : grid_stride_range_y(0, ns)) + { + if (threadIdx.x == 0) + v[n] = 1.0f / std::sqrt(v[n] - m[n] * m[n] + eps); + } + } + + __global__ void _cuda_layer_normalize_apply( + float* out, + const float* s, + const float* m, + const float* v, + const float* g, + const float* b, + size_t ns, + size_t k, + size_t num + ) + { + for (auto n : grid_stride_range_y(0, ns)) + { + const auto ps = s + n * k * num; + const auto pout = out + n * k * num; + for (auto i : grid_stride_range(0, k* num)) + { + pout[i] = (ps[i] - m[n]) * v[n]; + pout[i] = pout[i] * g[i / num] + b[i / num]; + } + } + } + + void layer_normalize( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + const long num = src.nr() * src.nc(); + DLIB_CASSERT( + have_same_dimensions(gamma, beta) && + gamma.k() == src.k() && + gamma.nr() == 1 && + gamma.nc() == 1 && + eps > 0, + "\nsrc.k(): " << src.k() << + 
"\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\neps: " << eps + ); + + const long ns = src.num_samples(); + const long ks = src.k(); + + dest.copy_size(src); + means.set_size(ns); + invstds.set_size(ns); + means = 0; + invstds = 0; + + launch_kernel(_cuda_layer_normalize_accumulate, max_jobs(ks * num, ns), + means.device(), invstds.device(), src.device(), ns, ks, num); + + launch_kernel(_cuda_layer_normalize_invert, max_jobs(1, ns), + means.device(), invstds.device(), eps, ns); + + launch_kernel(_cuda_layer_normalize_apply, max_jobs(ks * num, ns), + dest.device(), src.device(), means.device(), invstds.device(), + gamma.device(), beta.device(), ns, ks, num); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_layer_normalize_gradient_accumulate( + float* bg, + float* gg, + float* dv, + const float* s, + const float* gi, + const float* m, + const float* v, + const float* g, + size_t ns, + size_t ks, + size_t num + ) + { + for (auto nk : grid_stride_range_y(0, ns* ks)) + { + const auto n = nk / ks; + const auto k = nk % ks; + const auto ps = s + (n * ks + k) * num; + const auto pgi = gi + (n * ks + k) * num; + const float invstd_pow = -0.5f * std::pow(v[n], 3.0f); + float temp_bg = 0; + float temp_gg = 0; + float temp_dv = 0; + for (auto i : grid_stride_range(0, num)) + { + const float x_hat = (ps[i] - m[n]) * v[n]; + const float dx = pgi[i] * g[k]; + temp_bg += pgi[i]; + temp_gg += pgi[i] * x_hat; + temp_dv += dx * (ps[i] - m[n]) * invstd_pow; + } + warp_reduce_atomic_add(bg[k], temp_bg); + warp_reduce_atomic_add(gg[k], temp_gg); + warp_reduce_atomic_add(dv[n], temp_dv); + } + } + + __global__ void _cuda_layer_normalize_gradient_compute_dm( + float* dm, + const float* dv, + const float* s, + const float* gi, + const float* m, + const 
float* v, + const float* g, + size_t ns, + size_t ks, + size_t num + ) + { + const float invnum = 1.0f / (ks * num); + for (auto n : grid_stride_range_y(0, ns)) + { + const auto ps = s + n * ks * num; + const auto pgi = gi + n * ks * num; + float temp_dm = 0; + for (auto i : grid_stride_range(0, ks* num)) + { + const float dx = pgi[i] * g[i / num]; + temp_dm += -dx * v[n] + dv[n] * -2 * (ps[i] - m[n]) * invnum; + } + warp_reduce_atomic_add(dm[n], temp_dm); + } + } + + __global__ void _cuda_layer_normalize_gradient_apply( + float* out, + const float* dm, + const float* dv, + const float* s, + const float* gi, + const float* m, + const float* v, + const float* g, + size_t ns, + size_t ks, + size_t num + ) + { + const float invnum = 1.0f / (ks * num); + for (auto n : grid_stride_range_y(0, ns)) + { + const auto ps = s + n * ks * num; + const auto pgi = gi + n * ks * num; + const auto pout = out + n * ks * num; + for (auto i : grid_stride_range(0, ks* num)) + { + const float dx = pgi[i] * g[i / num]; + pout[i] += dx * v[n] + dv[n] * 2 * (ps[i] - m[n]) * invnum + dm[n] * invnum; + } + } + } + + void layer_normalize_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad, + resizable_tensor& dmeans, + resizable_tensor& dvars + ) + { + const long num = src.nr() * src.nc(); + DLIB_CASSERT(src.num_samples() == means.size()); + DLIB_CASSERT(src.num_samples() == invstds.size()); + DLIB_CASSERT(have_same_dimensions(gamma, gamma_grad)); + DLIB_CASSERT(have_same_dimensions(gamma_grad, beta_grad)); + DLIB_CASSERT(gamma.k() == src.k()); + DLIB_CASSERT(gamma.nr() == 1); + DLIB_CASSERT(gamma.nc() == 1); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + const long ns = src.num_samples(); + const long ks = src.k(); + + 
beta_grad = 0; + gamma_grad = 0; + dvars.copy_size(invstds); + dmeans.copy_size(means); + dvars = 0; + dmeans = 0; + + launch_kernel(_cuda_layer_normalize_gradient_accumulate, max_jobs(ks * num, ns * ks), + beta_grad.device(), gamma_grad.device(), dvars.device(), + src.device(), gradient_input.device(), means.device(), invstds.device(), + gamma.device(), ns, ks, num); + + launch_kernel(_cuda_layer_normalize_gradient_compute_dm, max_jobs(ks * num, ns), + dmeans.device(), dvars.device(), + src.device(), gradient_input.device(), means.device(), invstds.device(), + gamma.device(), ns, ks, num); + + launch_kernel(_cuda_layer_normalize_gradient_apply, max_jobs(ks * num, ns), + src_grad.device(), dmeans.device(), dvars.device(), + src.device(), gradient_input.device(), means.device(), invstds.device(), + gamma.device(), ns, ks, num); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_rms_normalize_accumulate( + float* scale, + const float* src, + size_t ns, + size_t ks, + size_t num + ) + { + for (auto n : grid_stride_range_y(0, ns)) + { + const auto ps = src + n * ks * num; + float sum_squares = 0.0f; + for (auto i : grid_stride_range(0, ks* num)) + { + sum_squares += ps[i] * ps[i]; + } + warp_reduce_atomic_add(scale[n], sum_squares / (ks * num)); + } + } + + __global__ void _cuda_rms_normalize_invert( + float* scale, + float eps, + size_t ns + ) + { + for (auto n : grid_stride_range_y(0, ns)) + { + if (threadIdx.x == 0) + scale[n] = 1.0f / std::sqrt(scale[n] + eps); + } + } + + __global__ void _cuda_rms_normalize_apply( + float* dest, + const float* scale, + const float* src, + const float* gamma, + size_t ns, + size_t ks, + size_t num + ) + { + for (auto n : grid_stride_range_y(0, ns)) + { + const auto ps = src + n * ks * num; + const auto pd = dest + n * ks * num; + for (auto i : grid_stride_range(0, ks* num)) + { + pd[i] = ps[i] * scale[n] * gamma[i / num]; + } + } + } + + void rms_normalize( 
+ const double eps, + resizable_tensor& dest, + resizable_tensor& scale, + const tensor& src, + const tensor& gamma + ) + { + DLIB_CASSERT( + gamma.k() == src.k() && + gamma.nr() == 1 && + gamma.nc() == 1 && + eps > 0, + "\nsrc.k(): " << src.k() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\neps: " << eps + ); + + const long ns = src.num_samples(); + const long ks = src.k(); + const long num = src.nr() * src.nc(); + + dest.copy_size(src); + scale.set_size(ns); + scale = 0; + + launch_kernel(_cuda_rms_normalize_accumulate, max_jobs(ks * num, ns), + scale.device(), src.device(), ns, ks, num); + + launch_kernel(_cuda_rms_normalize_invert, max_jobs(1, ns), + scale.device(), eps, ns); + + launch_kernel(_cuda_rms_normalize_apply, max_jobs(ks * num, ns), + dest.device(), scale.device(), src.device(), gamma.device(), ns, ks, num); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_rms_normalize_gradient_accumulate( + float* gamma_grad, + float* dscale, + const float* src, + const float* gradient_input, + const float* scale, + const float* gamma, + size_t ns, + size_t ks, + size_t num + ) + { + for (auto nk : grid_stride_range_y(0, ns* ks)) + { + const auto n = nk / ks; + const auto k = nk % ks; + const auto ps = src + (n * ks + k) * num; + const auto pgi = gradient_input + (n * ks + k) * num; + const float scale_pow = -0.5f * std::pow(scale[n], 3.0f); + float temp_gg = 0.0f; + float temp_ds = 0.0f; + for (auto i : grid_stride_range(0, num)) + { + const float x_hat = ps[i] * scale[n]; + const float dx = pgi[i] * gamma[k]; + temp_gg += pgi[i] * x_hat; + temp_ds += dx * ps[i] * scale_pow; + } + warp_reduce_atomic_add(gamma_grad[k], temp_gg); + warp_reduce_atomic_add(dscale[n], temp_ds); + } + } + + __global__ void _cuda_rms_normalize_gradient_apply( + float* src_grad, + const float* dscale, + const float* src, + const float* 
gradient_input, + const float* scale, + const float* gamma, + size_t ns, + size_t ks, + size_t num + ) + { + const float invnum = 1.0f / (ks * num); + for (auto n : grid_stride_range_y(0, ns)) + { + const auto ps = src + n * ks * num; + const auto pgi = gradient_input + n * ks * num; + const auto psg = src_grad + n * ks * num; + for (auto i : grid_stride_range(0, ks* num)) + { + const float dx = pgi[i] * gamma[i / num]; + psg[i] += dx * scale[n] + dscale[n] * 2 * ps[i] * invnum; + } + } + } + + void rms_normalize_gradient( + const tensor& gradient_input, + const tensor& scale, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + resizable_tensor& dscale + ) + { + DLIB_CASSERT(src.num_samples() == scale.size()); + DLIB_CASSERT(have_same_dimensions(gamma, gamma_grad)); + DLIB_CASSERT(gamma.k() == src.k()); + DLIB_CASSERT(gamma.nr() == 1); + DLIB_CASSERT(gamma.nc() == 1); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + + const long ns = src.num_samples(); + const long ks = src.k(); + const long num = src.nr() * src.nc(); + + gamma_grad = 0; + dscale.copy_size(scale); + dscale = 0; + + launch_kernel(_cuda_rms_normalize_gradient_accumulate, max_jobs(ks * num, ns * ks), + gamma_grad.device(), dscale.device(), + src.device(), gradient_input.device(), scale.device(), gamma.device(), + ns, ks, num); + + launch_kernel(_cuda_rms_normalize_gradient_apply, max_jobs(ks * num, ns), + src_grad.device(), dscale.device(), + src.device(), gradient_input.device(), scale.device(), gamma.device(), + ns, ks, num); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) + { + for(auto i : grid_stride_range(0, size)) + { + size_t blk = i/block_size; + size_t j = i%block_size; + 
dest[blk*dest_stride + j] += src[blk*src_stride + j]; + } + } + + __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) + { + for(auto i : grid_stride_range(0, size)) + { + size_t blk = i/block_size; + size_t j = i%block_size; + dest[blk*dest_stride + j] = src[blk*src_stride + j]; + } + } + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ) + { + const size_t dest_sample_size = static_cast(dest.nc() * dest.nr() * dest.k()); + const size_t src_sample_size = static_cast(src.nc() * src.nr() * src.k()); + + const size_t block_size = count_k * dest.nc() * dest.nr(); + + DLIB_CASSERT(dest.num_samples() == src.num_samples() && + dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size"); + DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor"); + DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor"); + + float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr(); + const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();; + + if (add_to) + { + launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()), + dest_p, block_size*dest.num_samples(), + src_p, dest_sample_size, src_sample_size, block_size); + } + else + { + launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()), + dest_p, block_size*dest.num_samples(), + src_p, dest_sample_size, src_sample_size, block_size); + } + } + + __global__ void _cuda_copy_strided_tensor_add_to (float* dest, const float* src, + size_t ns, size_t nk, size_t nr, size_t nc, + size_t dk, size_t dr, size_t dc, + size_t sk, size_t sr, size_t sc) + { + for(auto i : grid_stride_range(0, ns*nk*nr*nc)) + { + size_t n,k,r,c; + unpack_idx(i, nk,nr,nc, n,k,r,c); + dest[pack_idx(dk,dr,dc, n,k,r,c)] += src[pack_idx(sk,sr,sc, n,k,r,c)]; + } + } + + __global__ void 
_cuda_copy_strided_tensor (float* dest, const float* src, + size_t ns, size_t nk, size_t nr, size_t nc, + size_t dk, size_t dr, size_t dc, + size_t sk, size_t sr, size_t sc) + { + for(auto i : grid_stride_range(0, ns*nk*nr*nc)) + { + size_t n,k,r,c; + unpack_idx(i, nk,nr,nc, n,k,r,c); + dest[pack_idx(dk,dr,dc, n,k,r,c)] = src[pack_idx(sk,sr,sc, n,k,r,c)]; + } + } + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dk, size_t dnr, size_t dnc, + const tensor& src, + size_t sk, size_t snr, size_t snc, + size_t k, size_t nr, size_t nc + ) + { + + DLIB_CASSERT(dest.num_samples() == src.num_samples(), "All sources should fit into dest tensor size"); + DLIB_CASSERT(dest.k() - dk >= k && + dest.nr() - dnr >= nr && + dest.nc() - dnc >= nc, "Not enough space in dest tensor"); + DLIB_CASSERT(src.k() - sk >= k && + src.nr() - snr >= nr && + src.nc() - snc >= nc, "Not enough space in src tensor"); + + float* dest_p = dest.device() + dk * static_cast(dest.nc() * dest.nr()) \ + + dnr * static_cast(dest.nc()) \ + + dnc; + + const float* src_p = src.device() + sk * static_cast(src.nc() * src.nr()) \ + + snr * static_cast(src.nc()) \ + + snc; + + if (add_to) + { + launch_kernel(_cuda_copy_strided_tensor_add_to, max_jobs(dest.size()), + dest_p, src_p, dest.num_samples(), + k, nr, nc, + dest.k(), dest.nr(), dest.nc(), + src.k(), src.nr(), src.nc()); + } + else + { + launch_kernel(_cuda_copy_strided_tensor, max_jobs(dest.size()), + dest_p, src_p, dest.num_samples(), + k, nr, nc, + dest.k(), dest.nr(), dest.nc(), + src.k(), src.nr(), src.nc()); + } + } + + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_transpose(size_t dsize, size_t dk, size_t dnr, size_t dnc, float* d, + size_t sk, size_t snr, int snc, const float* s, const bool add_to) + { + const auto plane_size = dnr * dnc; + const auto sample_size = dk * plane_size; + for (auto i : grid_stride_range(0, dsize)) + { + const auto n = i / sample_size; + 
const auto idx = i % plane_size; + const auto in_k = (i / plane_size) % dk; + const auto in_r = idx % dnc; + const auto in_c = idx / dnc; + + const auto in_idx = ((n * sk + in_k) * snr + in_r) * snc + in_c; + if (add_to) d[i] += s[in_idx]; + else d[i] = s[in_idx]; + } + } + + void transpose( + bool add_to, + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(is_same_object(dest, src) == false); + DLIB_CASSERT(dest.num_samples() == src.num_samples() && + dest.k() == src.k() && + dest.nr() == src.nc() && + dest.nc() == src.nr(), + "Incompatible tensor dimensions."); + + launch_kernel(_cuda_transpose, max_jobs(dest.size()), dest.size(), + dest.k(), dest.nr(), dest.nc(), dest.device(), + src.k(), src.nr(), src.nc(), src.device(), add_to); + } + + // ---------------------------------------------------------------------------------------- + + // CUDA Kernels for ACT operations + + // Kernel 1: initialize logits with bias + __global__ void _cuda_act_init_logits( + float* logits, + float b_halt, + size_t total_positions + ) + { + for (auto pos : grid_stride_range(0, total_positions)) + logits[pos] = b_halt; + } + + // Kernel 2: compute dot product and accumulate into logits + __global__ void _cuda_act_accumulate_logits( + float* logits, + const float* input_data, + const float* W_halt, + size_t batch_size, + size_t seq_len, + size_t d_model, + size_t num_channels, + size_t feature_dim + ) + { + const long total_positions = batch_size * seq_len; + + for (auto pos : grid_stride_range_y(0, total_positions)) + { + const long n = pos / seq_len; + const long s = pos % seq_len; + + float temp = 0; + for (auto feat_idx : grid_stride_range(0, feature_dim)) + { + const long c = feat_idx / d_model; + const long d = feat_idx % d_model; + + const long in_idx = ((n * num_channels + c) * seq_len + s) * d_model + d; + temp += input_data[in_idx] * W_halt[feat_idx]; + } + + warp_reduce_atomic_add(logits[pos], temp); + } + } + + // Kernel 3: apply sigmoid to compute halt probabilities 
+ __global__ void _cuda_act_apply_sigmoid( + float* halt_probs, + const float* logits, + size_t total_positions + ) + { + for (auto pos : grid_stride_range(0, total_positions)) + halt_probs[pos] = 1.0f / (1.0f + expf(-logits[pos])); + } + + void compute_act_halt_probabilities( + resizable_tensor& halt_probs, + resizable_tensor& logits, + const tensor& input_data, + const tensor& halt_params, + long batch_size, + long seq_len, + long feature_dim + ) + { + const long total_positions = batch_size * seq_len; + const long d_model = feature_dim / input_data.k(); + const long num_channels = input_data.k(); + + halt_probs.set_size(total_positions, 1, 1, 1); + logits.set_size(total_positions, 1, 1, 1); + + // Extract bias from halt_params (last element) + const float b_halt = halt_params.host()[feature_dim]; + + // Phase 1: initialize logits with bias + launch_kernel(_cuda_act_init_logits, + max_jobs(total_positions), + logits.device(), + b_halt, + total_positions); + + // Phase 2: accumulate dot product into logits + // Note: sequential kernel launch provides implicit synchronization + launch_kernel(_cuda_act_accumulate_logits, + max_jobs(feature_dim, total_positions), + logits.device(), + input_data.device(), + halt_params.device(), + batch_size, + seq_len, + d_model, + num_channels, + feature_dim); + + // Phase 3: apply sigmoid + // Note: sequential kernel launch provides implicit synchronization + launch_kernel(_cuda_act_apply_sigmoid, + max_jobs(total_positions), + halt_probs.device(), + logits.device(), + total_positions); + } + + __global__ void _cuda_update_act_state( + float* output, + const float* input_data, + const float* halt_probs, + float* cumulative_halting, + float* remainders, + float* n_steps, + float* effective_weights, + size_t batch_size, + size_t seq_len, + size_t d_model, + size_t num_channels, + float halt_threshold, + long current_step + ) + { + for (auto pos : grid_stride_range(0, batch_size * seq_len)) + { + if (cumulative_halting[pos] < 
halt_threshold) + { + const size_t n = pos / seq_len; + const size_t s = pos % seq_len; + + float p = halt_probs[pos]; + float r = remainders[pos]; + float effective = fminf(p * r, halt_threshold - cumulative_halting[pos]); + + cumulative_halting[pos] += effective; + remainders[pos] -= effective; + n_steps[pos] = static_cast(current_step + 1); + effective_weights[pos] += effective; + + for (size_t c = 0; c < num_channels; ++c) { + for (size_t d = 0; d < d_model; ++d) { + const size_t idx = ((n * num_channels + c) * seq_len + s) * d_model + d; + output[idx] += effective * input_data[idx]; + } + } + } + } + } + + void update_act_state( + resizable_tensor& output, + const tensor& input_data, + const tensor& halt_probs, + resizable_tensor& cumulative_halting, + resizable_tensor& remainders, + resizable_tensor& n_steps, + resizable_tensor& effective_weights, + long batch_size, + long seq_len, + long d_model, + long num_channels, + float halt_threshold, + long current_step + ) + { + const long total_positions = batch_size * seq_len; + + launch_kernel(_cuda_update_act_state, + max_jobs(total_positions), + output.device(), + input_data.device(), + halt_probs.device(), + cumulative_halting.device(), + remainders.device(), + n_steps.device(), + effective_weights.device(), + batch_size, + seq_len, + d_model, + num_channels, + halt_threshold, + current_step); + } + + __global__ void _cuda_finalize_act_output( + float* output, + const float* input_data, + const float* remainders, + float* effective_weights, + size_t batch_size, + size_t seq_len, + size_t d_model, + size_t num_channels + ) + { + for (auto pos : grid_stride_range(0, batch_size * seq_len)) + { + float r = remainders[pos]; + if (r > 1e-6f) { + const size_t n = pos / seq_len; + const size_t s = pos % seq_len; + + effective_weights[pos] += r; + + for (size_t c = 0; c < num_channels; ++c) { + for (size_t d = 0; d < d_model; ++d) { + const size_t idx = ((n * num_channels + c) * seq_len + s) * d_model + d; + output[idx] 
+= r * input_data[idx]; + } + } + } + } + } + + void finalize_act_output( + resizable_tensor& output, + const tensor& input_data, + const tensor& remainders, + resizable_tensor& effective_weights, + long batch_size, + long seq_len, + long d_model, + long num_channels + ) + { + const long total_positions = batch_size * seq_len; + + launch_kernel(_cuda_finalize_act_output, + max_jobs(total_positions), + output.device(), + input_data.device(), + remainders.device(), + effective_weights.device(), + batch_size, + seq_len, + d_model, + num_channels); + } + + __global__ void _cuda_apply_act_depth_scaling( + float* gradients, + const float* n_steps, + size_t batch_size, + size_t seq_len, + size_t d_model, + size_t num_channels, + float max_steps, + float scale_factor + ) + { + const long total_positions = batch_size * seq_len; + const long feature_dim = num_channels * d_model; + + for (auto pos : grid_stride_range_y(0, total_positions)) + { + const long n = pos / seq_len; + const long s = pos % seq_len; + const float scale = 1.0f + scale_factor * (n_steps[pos] / max_steps); + + for (auto feat_idx : grid_stride_range(0, feature_dim)) + { + const long c = feat_idx / d_model; + const long d = feat_idx % d_model; + const long idx = ((n * num_channels + c) * seq_len + s) * d_model + d; + gradients[idx] *= scale; + } + } + } + + void apply_act_depth_scaling( + tensor& gradients, + const tensor& n_steps, + long batch_size, + long seq_len, + long d_model, + long num_channels, + float max_steps, + float scale_factor + ) + { + const long total_positions = batch_size * seq_len; + const long feature_dim = num_channels * d_model; + + launch_kernel(_cuda_apply_act_depth_scaling, + max_jobs(feature_dim, total_positions), + gradients.device(), + n_steps.device(), + batch_size, + seq_len, + d_model, + num_channels, + max_steps, + scale_factor); + } + + // ---------------------------------------------------------------------------------------- + + + __device__ float cuda_log1pexp(float x) + 
{ + if (x <= -18) + return std::exp(x); + else if (-18 < x && x <= 9) + return std::log1pf(std::exp(x)); + else if (9 < x && x <= 16) + return x + expf(-x); + else + return x; + } + + __global__ void _cuda_compute_loss_binary_log_per_pixel(float* loss_out, float* g, const float* truth, const float* out_data, size_t n, const float scale) + { + float loss = 0; + for(auto i : grid_stride_range(0, n)) + { + const float y = truth[i]; + + if (y > 0.f) + { + const float temp = cuda_log1pexp(-out_data[i]); + loss += y*temp; + g[i] = y*scale*(g[i]-1); + } + else if (y < 0.f) + { + const float temp = -(-out_data[i]-cuda_log1pexp(-out_data[i])); + loss += -y*temp; + g[i] = -y*scale*g[i]; + } + else + { + g[i] = 0.f; + } + } + + warp_reduce_atomic_add(*loss_out, loss); + } + + // ---------------------------------------------------------------------------------------- + + __device__ float cuda_safe_log(float x, float epsilon = 1e-10) + { + // Prevent trying to calculate the logarithm of a very small number (let alone zero) + if (x >= epsilon) + return ::log(x); + else + return ::log(epsilon); + } + + __global__ void _cuda_compute_loss_multiclass_log_per_pixel(float* loss_out, float* g, const uint16_t* truth, size_t n, size_t plane_size, size_t sample_size, size_t nk, uint16_t label_to_ignore, const float scale) + { + float loss = 0; + for(auto i : grid_stride_range(0, n)) + { + const size_t k = (i/plane_size)%nk; + const size_t idx = (i%plane_size) + plane_size*(i/sample_size); + + const size_t y = truth[idx]; + + if (k == y) + { + loss -= cuda_safe_log(g[i]); + g[i] = scale*(g[i] - 1); + } + else if (y == label_to_ignore) + { + g[i] = 0.f; + } + else + { + g[i] = scale*g[i]; + } + } + + warp_reduce_atomic_add(*loss_out, loss); + } + + __global__ void _cuda_compute_loss_multiclass_log_per_pixel_weighted(float* loss_out, float* g, const uint16_t* truth, size_t n, size_t plane_size, size_t sample_size, size_t nk, const float* weights, const float scale) + { + float loss = 0; + 
for(auto i : grid_stride_range(0, n)) + { + const size_t k = (i/plane_size)%nk; + const size_t idx = (i%plane_size) + plane_size*(i/sample_size); + + const size_t y = truth[idx]; + const float weight = weights[idx]; + + if (k == y) + { + loss -= weight*cuda_safe_log(g[i]); + g[i] = weight*scale*(g[i] - 1); + } + else + { + g[i] = weight*scale*g[i]; + } + } + + warp_reduce_atomic_add(*loss_out, loss); + } + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_compute_loss_mean_squared_per_channel_and_pixel(float* loss_out, float* g, const float* truth, const float* out_data, size_t n, const float scale) + { + float loss = 0; + for (auto i : grid_stride_range(0, n)) + { + const float y = truth[i]; + const float temp = y - out_data[i]; + loss += temp * temp; + g[i] = -temp * scale; + } + warp_reduce_atomic_add(*loss_out, loss); + } + + // ---------------------------------------------------------------------------------------- + + void compute_loss_binary_log_per_pixel:: + do_work( + cuda_data_ptr loss_work_buffer, + cuda_data_ptr truth_buffer, + const tensor& subnetwork_output, + tensor& gradient, + double& loss + ) + { + CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float))); + sigmoid(gradient, subnetwork_output); + + // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. 
+            const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc());
+
+            launch_kernel(_cuda_compute_loss_binary_log_per_pixel, max_jobs(gradient.size()),
+                loss_work_buffer.data(), gradient.device(), truth_buffer.data(), subnetwork_output.device(), gradient.size(), scale);
+
+            float floss;
+            dlib::cuda::memcpy(&floss, loss_work_buffer);
+            loss = scale*floss;
+        }
+
+        void compute_loss_multiclass_log_per_pixel::
+        do_work(
+            cuda_data_ptr<float> loss_work_buffer,
+            cuda_data_ptr<const uint16_t> truth_buffer,
+            const tensor& subnetwork_output,
+            tensor& gradient,
+            double& loss
+        )
+        {
+            CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float)));
+            softmax(gradient, subnetwork_output);
+            static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
+
+            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
+            const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc());
+
+            launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel, max_jobs(gradient.size()),
+                loss_work_buffer.data(), gradient.device(), truth_buffer.data(), gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), label_to_ignore, scale);
+
+            float floss;
+            dlib::cuda::memcpy(&floss, loss_work_buffer);
+            loss = scale*floss;
+        }
+
+        void compute_loss_multiclass_log_per_pixel_weighted::
+        do_work(
+            cuda_data_ptr<float> loss_work_buffer,
+            cuda_data_ptr<const uint16_t> truth_buffer,
+            cuda_data_ptr<const float> weights_buffer,
+            const tensor& subnetwork_output,
+            tensor& gradient,
+            double& loss
+        )
+        {
+            CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float)));
+            softmax(gradient, subnetwork_output);
+
+            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
+ const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc()); + + launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel_weighted, max_jobs(gradient.size()), + loss_work_buffer.data(), gradient.device(), truth_buffer.data(), gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), weights_buffer.data(), scale); + + float floss; + dlib::cuda::memcpy(&floss, loss_work_buffer); + loss = scale*floss; + } + + void compute_loss_mean_squared_per_channel_and_pixel:: + do_work( + cuda_data_ptr loss_work_buffer, + cuda_data_ptr truth_buffer, + const tensor& subnetwork_output, + tensor& gradient, + double& loss + ) + { + CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float))); + + // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. + const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.k() * subnetwork_output.nr() * subnetwork_output.nc()); + + launch_kernel(_cuda_compute_loss_mean_squared_per_channel_and_pixel , max_jobs(gradient.size()), + loss_work_buffer.data(), gradient.device(), truth_buffer.data(), subnetwork_output.device(), gradient.size(), scale); + + float floss; + dlib::cuda::memcpy(&floss, loss_work_buffer); + loss = scale*floss; + } + + // ---------------------------------------------------------------------------------------- + + } +} + diff --git a/dlib/cuda/cuda_dlib.h b/dlib/cuda/cuda_dlib.h index 3c60662b76..11c700a557 100644 --- a/dlib/cuda/cuda_dlib.h +++ b/dlib/cuda/cuda_dlib.h @@ -658,6 +658,65 @@ namespace dlib float scale_factor ); + // ---------------------------------------------------------------------------------------- + + void apply_rotary_positional_embedding( + bool is_backward, + tensor& data, + const tensor& cos_cache, + const tensor& sin_cache + ); + + // ---------------------------------------------------------------------------------------- + + class 
compute_loss_cross_entropy_per_logit + { + /*! + The point of this class is to compute the loss computed by + loss_cross_entropy_per_logit_, but to do so with CUDA + !*/ + public: + compute_loss_cross_entropy_per_logit() {} + + template + void operator() ( + const_label_iterator truth, + const tensor& input_tensor, // Source tokens + const tensor& subnetwork_output, // Logits + tensor& gradient, + double& loss, + long ignore_index + ) const + { + const size_t bytes_per_sample = sizeof(unsigned long); + buf = device_global_buffer(subnetwork_output.num_samples() * bytes_per_sample + sizeof(float)); + cuda_data_ptr loss_buf = static_pointer_cast(buf, 1); + buf = buf + sizeof(float); + + for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth) + { + const unsigned long t = *truth; + memcpy(buf + i * bytes_per_sample, &t, bytes_per_sample); + } + + auto truth_buf = static_pointer_cast(buf, subnetwork_output.num_samples()); + do_work(loss_buf, truth_buf, input_tensor, subnetwork_output, gradient, loss, ignore_index); + } + + private: + static void do_work( + cuda_data_ptr loss_work_buffer, + cuda_data_ptr truth_buffer, + const tensor& input_tensor, + const tensor& subnetwork_output, + tensor& gradient, + double& loss, + long ignore_index + ); + + mutable cuda_data_void_ptr buf; + }; + // ---------------------------------------------------------------------------------------- class compute_loss_binary_log_per_pixel diff --git a/dlib/cuda/tensor.h b/dlib/cuda/tensor.h index 6a893df311..138413b642 100644 --- a/dlib/cuda/tensor.h +++ b/dlib/cuda/tensor.h @@ -220,6 +220,17 @@ namespace dlib t.size() == (size_t)t.nc(); } +// ---------------------------------------------------------------------------------------- + + inline bool is_2d_matrix( + const tensor& t + ) + { + return !is_vector(t) && + (t.size() == (size_t)(t.num_samples() * t.k()) || + t.size() == (size_t)(t.nr() * t.nc())); + } + // 
---------------------------------------------------------------------------------------- inline const matrix_op > mat ( diff --git a/dlib/cuda/tensor_abstract.h b/dlib/cuda/tensor_abstract.h index 62f649391e..3a3d83eda7 100644 --- a/dlib/cuda/tensor_abstract.h +++ b/dlib/cuda/tensor_abstract.h @@ -359,6 +359,18 @@ namespace dlib - t.size() == t.nc() !*/ +// ---------------------------------------------------------------------------------------- + + inline bool is_2d_matrix( + const tensor& t + ); + /*! + ensures + - returns true if and only if one of the following is true: + - t.size() == t.num_samples() * t.k() + - t.size() == t.nr() * t.nc() + !*/ + // ---------------------------------------------------------------------------------------- const matrix_exp mat ( diff --git a/dlib/cuda/tensor_tools.cpp b/dlib/cuda/tensor_tools.cpp index 6340dfe7d3..ea32dc9426 100644 --- a/dlib/cuda/tensor_tools.cpp +++ b/dlib/cuda/tensor_tools.cpp @@ -242,39 +242,54 @@ namespace dlib { namespace tt } else if (mode == operation_mode::PLANE_WISE) { - auto is_matrix = [](const auto& tensor) { - return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) || - (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1)); - }; - - long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); - long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); - const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest); + const bool lhs_is_matrix = is_2d_matrix(lhs); + const bool rhs_is_matrix = is_2d_matrix(rhs); + const bool dest_is_matrix = is_2d_matrix(dest); + + const size_t lhs_plane_size = lhs.nr() * lhs.nc(); + const size_t rhs_plane_size = rhs.nr() * rhs.nc(); + const size_t dest_plane_size = dest.nr() * dest.nc(); + + long num_samples, num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); + if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) + num_samples = 1; + 
else if (!lhs_is_matrix && rhs_is_matrix) + num_samples = lhs.num_samples(); + else + num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); - if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) { - num_samples = num_channels = 1; + size_t lhs_rows = lhs.nr(); + size_t lhs_cols = lhs.nc(); + if (lhs_is_matrix && (lhs.num_samples() > 1 || lhs.k() > 1)) { + lhs_rows = lhs.num_samples(); + lhs_cols = lhs.k(); + } + size_t rhs_rows = rhs.nr(); + size_t rhs_cols = rhs.nc(); + if (rhs_is_matrix && (rhs.num_samples() > 1 || rhs.k() > 1)) { + rhs_rows = rhs.num_samples(); + rhs_cols = rhs.k(); + } + size_t dest_rows = dest.nr(); + size_t dest_cols = dest.nc(); + if (dest_is_matrix && (dest.num_samples() > 1 || dest.k() > 1)) { + dest_rows = dest.num_samples(); + dest_cols = dest.k(); } - long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr(); - long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc(); - long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr(); - long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc(); - long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr(); - long dest_cols = (dest_is_matrix && dest.k() > 1) ? dest.k() : dest.nc(); - - const size_t lhs_plane_size = lhs_rows * lhs_cols; - const size_t rhs_plane_size = rhs_rows * rhs_cols; - const size_t dest_plane_size = dest_rows * dest_cols; - + // Process each plane for (long b = 0; b < num_samples; ++b) { for (long c = 0; c < num_channels; ++c) { - auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) : + auto lhs_slice = lhs_is_matrix ? + alias_tensor(lhs_rows, lhs_cols)(lhs, 0) : alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size); - auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) : + auto rhs_slice = rhs_is_matrix ? 
+ alias_tensor(rhs_rows, rhs_cols)(rhs, 0) : alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size); - auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) : + auto dest_slice = dest_is_matrix ? + alias_tensor(dest_rows, dest_cols)(dest, 0) : alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size); if (beta != 0) @@ -1498,6 +1513,22 @@ namespace dlib { namespace tt #endif } +// ---------------------------------------------------------------------------------------- + + void apply_rotary_positional_embedding( + bool is_backward, + resizable_tensor& data, + const resizable_tensor& cos_cache, + const resizable_tensor& sin_cache + ) + { +#ifdef DLIB_USE_CUDA + cuda::apply_rotary_positional_embedding(is_backward, data, cos_cache, sin_cache); +#else + cpu::apply_rotary_positional_embedding(is_backward, data, cos_cache, sin_cache); +#endif + } + // ---------------------------------------------------------------------------------------- }} diff --git a/dlib/cuda/tensor_tools.h b/dlib/cuda/tensor_tools.h index 4079098cf6..4f906db37f 100644 --- a/dlib/cuda/tensor_tools.h +++ b/dlib/cuda/tensor_tools.h @@ -2519,6 +2519,39 @@ namespace dlib { namespace tt - scale_factor: scaling strength (0 = no scaling) !*/ +// ---------------------------------------------------------------------------------------- + + void apply_rotary_positional_embedding( + bool is_backward, + resizable_tensor& data, + const resizable_tensor& cos_cache, + const resizable_tensor& sin_cache + ); + /*! 
+ requires + - data.nr() == cos_cache.nr() + - data.nr() == sin_cache.nr() + - cos_cache.nc() == data.nc() / 2 + - sin_cache.nc() == data.nc() / 2 + - cos_cache.num_samples() == 1 + - cos_cache.k() == 1 + - sin_cache.num_samples() == 1 + - sin_cache.k() == 1 + - data.nc() >= 2 + ensures + - Applies rotary positional embeddings (RoPE) to the input tensor + - data is modified in-place with the rotation applied pairwise to dimensions + - For each position pos and dimension pair (i, i+1): + if (!is_backward): + // Forward rotation (encoding) + data[pos,i] = data[pos,i] * cos_cache[pos,i/2] - data[pos,i+1] * sin_cache[pos,i/2] + data[pos,i+1] = data[pos,i] * sin_cache[pos,i/2] + data[pos,i+1] * cos_cache[pos,i/2] + else: + // Backward rotation (decoding, inverse transformation for gradients) + data[pos,i] = data[pos,i] * cos_cache[pos,i/2] + data[pos,i+1] * sin_cache[pos,i/2] + data[pos,i+1] = -data[pos,i] * sin_cache[pos,i/2] + data[pos,i+1] * cos_cache[pos,i/2] + !*/ + // ---------------------------------------------------------------------------------------- }} diff --git a/dlib/data_io.h b/dlib/data_io.h index 15c630e9e9..505f75108c 100644 --- a/dlib/data_io.h +++ b/dlib/data_io.h @@ -8,6 +8,7 @@ #include "data_io/mnist.h" #include "data_io/cifar.h" #include "data_io/arc_agi.h" +#include "data_io/language_model_data.h" #ifndef DLIB_ISO_CPP_ONLY #include "data_io/load_image_dataset.h" diff --git a/dlib/data_io/arc_agi.h b/dlib/data_io/arc_agi.h index 9153e8d4fd..64356dda8c 100644 --- a/dlib/data_io/arc_agi.h +++ b/dlib/data_io/arc_agi.h @@ -715,8 +715,8 @@ namespace dlib sequence.push_back(TOKEN_GEN_START); // Convert to dlib column vector - arc_token_sequence_t result(static_cast(sequence.size())); - for (long i = 0; i < static_cast(sequence.size()); ++i) + arc_token_sequence_t result(sequence.size()); + for (size_t i = 0; i < sequence.size(); ++i) result(i) = sequence[i]; return result; } @@ -736,8 +736,8 @@ namespace dlib append_flat_grid(sequence, 
test_pair.output); sequence.push_back(TOKEN_END_OF_OUTPUT); - arc_token_sequence_t result(static_cast(sequence.size())); - for (long i = 0; i < static_cast(sequence.size()); ++i) + arc_token_sequence_t result(sequence.size()); + for (size_t i = 0; i < sequence.size(); ++i) result(i) = sequence[i]; return result; } diff --git a/dlib/data_io/language_model_data.h b/dlib/data_io/language_model_data.h new file mode 100644 index 0000000000..d1f4aa6ae2 --- /dev/null +++ b/dlib/data_io/language_model_data.h @@ -0,0 +1,976 @@ +#ifndef DLIB_LANGUAGE_MODEL_DATA_H_ +#define DLIB_LANGUAGE_MODEL_DATA_H_ + +#include "language_model_data_abstract.h" + +#include +#include +#include +#include "../matrix.h" +#include "../serialize.h" + +namespace dlib +{ + + // --------------------------------------------------------------------------------- + + enum class file_content_type + { + TEXT_PLAIN, // Plain text file (including CSV, code, etc.) + TEXT_XML, // XML or HTML markup + IMAGE, // Image formats (PNG, JPEG, GIF, TIFF, BMP, etc.) + VIDEO, // Video formats (MP4, AVI, MKV, etc.) + AUDIO, // Audio formats (MP3, WAV, FLAC, etc.) + EXECUTABLE, // Executable files (EXE, DLL, ELF, Mach-O) + COMPRESSED, // Compressed archives (ZIP, GZIP, 7Z, RAR, etc.) + PDF, // PDF documents + OFFICE, // Office documents (DOCX, XLSX, PPTX, etc.) 
+ UNKNOWN // Unknown or undetermined file type + }; + + // --------------------------------------------------------------------------------- + + namespace impl + { + // Magic number signature structure + struct magic_signature + { + const unsigned char* bytes; + size_t length; + file_content_type type; + size_t offset; // Byte offset where signature should appear + }; + + // Common magic number signatures (ordered by frequency/priority) + static const unsigned char sig_png[] = { 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A }; + static const unsigned char sig_jpg1[] = { 0xFF, 0xD8, 0xFF, 0xE0 }; + static const unsigned char sig_jpg2[] = { 0xFF, 0xD8, 0xFF, 0xE1 }; + static const unsigned char sig_jpg3[] = { 0xFF, 0xD8, 0xFF, 0xDB }; + static const unsigned char sig_jpg4[] = { 0xFF, 0xD8, 0xFF, 0xEE }; + static const unsigned char sig_gif87[] = { 0x47, 0x49, 0x46, 0x38, 0x37, 0x61 }; // GIF87a + static const unsigned char sig_gif89[] = { 0x47, 0x49, 0x46, 0x38, 0x39, 0x61 }; // GIF89a + static const unsigned char sig_tiff_le[] = { 0x49, 0x49, 0x2A, 0x00 }; // Little endian + static const unsigned char sig_tiff_be[] = { 0x4D, 0x4D, 0x00, 0x2A }; // Big endian + static const unsigned char sig_bmp[] = { 0x42, 0x4D }; + static const unsigned char sig_webp[] = { 0x52, 0x49, 0x46, 0x46 }; // RIFF (check for WEBP at offset 8) + + static const unsigned char sig_pdf[] = { 0x25, 0x50, 0x44, 0x46 }; // %PDF + + static const unsigned char sig_zip[] = { 0x50, 0x4B, 0x03, 0x04 }; + static const unsigned char sig_gzip[] = { 0x1F, 0x8B }; + static const unsigned char sig_7z[] = { 0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C }; + static const unsigned char sig_rar[] = { 0x52, 0x61, 0x72, 0x21, 0x1A, 0x07 }; + + static const unsigned char sig_exe[] = { 0x4D, 0x5A }; // MZ (DOS/Windows executable) + static const unsigned char sig_elf[] = { 0x7F, 0x45, 0x4C, 0x46 }; // ELF (Unix/Linux executable) + static const unsigned char sig_macho_32[] = { 0xFE, 0xED, 0xFA, 0xCE }; // Mach-O 32-bit + static 
const unsigned char sig_macho_64[] = { 0xFE, 0xED, 0xFA, 0xCF }; // Mach-O 64-bit + + static const unsigned char sig_mp3_id3[] = { 0x49, 0x44, 0x33 }; // ID3 + static const unsigned char sig_mp3_ff[] = { 0xFF, 0xFB }; + static const unsigned char sig_wav[] = { 0x52, 0x49, 0x46, 0x46 }; // RIFF (check for WAVE at offset 8) + static const unsigned char sig_flac[] = { 0x66, 0x4C, 0x61, 0x43 }; // fLaC + static const unsigned char sig_ogg[] = { 0x4F, 0x67, 0x67, 0x53 }; // OggS + + static const unsigned char sig_mp4[] = { 0x66, 0x74, 0x79, 0x70 }; // ftyp (at offset 4) + static const unsigned char sig_avi[] = { 0x52, 0x49, 0x46, 0x46 }; // RIFF (check for AVI at offset 8) + static const unsigned char sig_mkv[] = { 0x1A, 0x45, 0xDF, 0xA3 }; + + static const magic_signature signatures[] = { + // Images + {sig_png, sizeof(sig_png), file_content_type::IMAGE, 0}, + {sig_jpg1, sizeof(sig_jpg1), file_content_type::IMAGE, 0}, + {sig_jpg2, sizeof(sig_jpg2), file_content_type::IMAGE, 0}, + {sig_jpg3, sizeof(sig_jpg3), file_content_type::IMAGE, 0}, + {sig_jpg4, sizeof(sig_jpg4), file_content_type::IMAGE, 0}, + {sig_gif87, sizeof(sig_gif87), file_content_type::IMAGE, 0}, + {sig_gif89, sizeof(sig_gif89), file_content_type::IMAGE, 0}, + {sig_tiff_le, sizeof(sig_tiff_le), file_content_type::IMAGE, 0}, + {sig_tiff_be, sizeof(sig_tiff_be), file_content_type::IMAGE, 0}, + {sig_bmp, sizeof(sig_bmp), file_content_type::IMAGE, 0}, + + // PDF + {sig_pdf, sizeof(sig_pdf), file_content_type::PDF, 0}, + + // Compressed + {sig_zip, sizeof(sig_zip), file_content_type::COMPRESSED, 0}, + {sig_gzip, sizeof(sig_gzip), file_content_type::COMPRESSED, 0}, + {sig_7z, sizeof(sig_7z), file_content_type::COMPRESSED, 0}, + {sig_rar, sizeof(sig_rar), file_content_type::COMPRESSED, 0}, + + // Executables + {sig_exe, sizeof(sig_exe), file_content_type::EXECUTABLE, 0}, + {sig_elf, sizeof(sig_elf), file_content_type::EXECUTABLE, 0}, + {sig_macho_32, sizeof(sig_macho_32), file_content_type::EXECUTABLE, 0}, + 
{sig_macho_64, sizeof(sig_macho_64), file_content_type::EXECUTABLE, 0}, + + // Audio + {sig_mp3_id3, sizeof(sig_mp3_id3), file_content_type::AUDIO, 0}, + {sig_mp3_ff, sizeof(sig_mp3_ff), file_content_type::AUDIO, 0}, + {sig_flac, sizeof(sig_flac), file_content_type::AUDIO, 0}, + {sig_ogg, sizeof(sig_ogg), file_content_type::AUDIO, 0}, + + // Video + {sig_mp4, sizeof(sig_mp4), file_content_type::VIDEO, 4}, + {sig_mkv, sizeof(sig_mkv), file_content_type::VIDEO, 0} + }; + + // Portable case-insensitive string comparison (C++14 compatible) + inline bool iequals_n(const char* s1, const char* s2, size_t n) + { + for (size_t i = 0; i < n; ++i) + { + const char c1 = (s1[i] >= 'A' && s1[i] <= 'Z') ? s1[i] + 32 : s1[i]; + const char c2 = (s2[i] >= 'A' && s2[i] <= 'Z') ? s2[i] + 32 : s2[i]; + if (c1 != c2) return false; + } + return true; + } + + // Case-insensitive check for file extension + inline bool has_extension(const std::string& filename, const char* ext) + { + const size_t ext_len = std::strlen(ext); + if (filename.length() < ext_len) return false; + + const size_t start = filename.length() - ext_len; + for (size_t i = 0; i < ext_len; ++i) + { + const char fc = filename[start + i]; + const char ec = ext[i]; + const char fc_lower = (fc >= 'A' && fc <= 'Z') ? fc + 32 : fc; + const char ec_lower = (ec >= 'A' && ec <= 'Z') ? 
ec + 32 : ec; + if (fc_lower != ec_lower) return false; + } + return true; + } + + // Calculate Shannon entropy for a buffer + inline double calculate_entropy(const unsigned char* buffer, size_t length) + { + if (length == 0) return 0.0; + + // Count byte frequency + std::array counts = {}; + for (size_t i = 0; i < length; ++i) + counts[buffer[i]]++; + + // Calculate entropy using Shannon's formula: H = -sum(p * log2(p)) + double entropy = 0.0; + const double length_d = static_cast(length); + + for (size_t i = 0; i < 256; ++i) + { + if (counts[i] > 0) + { + const double probability = static_cast(counts[i]) / length_d; + entropy -= probability * std::log2(probability); + } + } + + return entropy; + } + + // Check if buffer contains mostly printable ASCII/UTF-8 text + inline bool is_text_content(const unsigned char* buffer, size_t length) + { + if (length == 0) return false; + + size_t printable_count = 0; + size_t whitespace_count = 0; + size_t control_count = 0; + + for (size_t i = 0; i < length; ++i) + { + const unsigned char ch = buffer[i]; + + // Common whitespace characters + if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') + { + whitespace_count++; + printable_count++; + } + // Printable ASCII range + else if (ch >= 32 && ch <= 126) + { + printable_count++; + } + // UTF-8 continuation bytes (10xxxxxx) + else if ((ch & 0xC0) == 0x80) + { + printable_count++; + } + // UTF-8 multi-byte sequence starts (110xxxxx, 1110xxxx, 11110xxx) + else if ((ch & 0xE0) == 0xC0 || (ch & 0xF0) == 0xE0 || (ch & 0xF8) == 0xF0) + { + printable_count++; + } + // Control characters (excluding common whitespace) + else if (ch < 32) + { + control_count++; + } + } + + // Consider as text if >90% printable and <10% control chars + const double printable_ratio = static_cast(printable_count) / length; + const double control_ratio = static_cast(control_count) / length; + + return printable_ratio > 0.90 && control_ratio < 0.10; + } + + // Check for XML/HTML markers + inline bool 
is_xml_content(const unsigned char* buffer, size_t length) + { + if (length < 5) return false; + + const char* str = reinterpret_cast(buffer); + + // Check for "= 5 && buffer[0] == '<' && buffer[1] == '?') + { + if (iequals_n(str + 2, "xml", 3)) + return true; + } + + // Check for HTML doctype (case-insensitive) + if (length >= 9 && buffer[0] == '<' && buffer[1] == '!') + { + if (iequals_n(str + 2, "DOCTYPE", 7)) + return true; + } + + // Check for HTML tags (case-insensitive) + if (length >= 6 && buffer[0] == '<') + { + if (iequals_n(str + 1, "html>", 5) || iequals_n(str + 1, "html ", 5)) + return true; + } + + return false; + } + + // Special check for RIFF-based formats (WAV, AVI, WEBP) + inline file_content_type check_riff_type(const unsigned char* buffer, size_t length) + { + if (length < 12) return file_content_type::UNKNOWN; + + // RIFF format: "RIFF" + size (4 bytes) + format type (4 bytes) + if (std::memcmp(buffer + 8, "WAVE", 4) == 0) + return file_content_type::AUDIO; + else if (std::memcmp(buffer + 8, "AVI ", 4) == 0) + return file_content_type::VIDEO; + else if (std::memcmp(buffer + 8, "WEBP", 4) == 0) + return file_content_type::IMAGE; + + return file_content_type::UNKNOWN; + } + + // Check if ZIP is actually an Office document (DOCX, XLSX, PPTX) + inline file_content_type check_office_type(const std::string& filename) + { + if (has_extension(filename, ".docx") || + has_extension(filename, ".xlsx") || + has_extension(filename, ".pptx")) + { + return file_content_type::OFFICE; + } + + return file_content_type::COMPRESSED; + } + } + + // --------------------------------------------------------------------------------- + + inline bool detect_file_type( + const std::string& filename, + file_content_type& detected_type + ) + { + detected_type = file_content_type::UNKNOWN; + + // Open file in binary mode + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) + return false; + + // Read initial bytes for analysis (8KB should be sufficient) 
+ constexpr size_t BUFFER_SIZE = 8192; + std::array buffer; + + file.read(reinterpret_cast(buffer.data()), BUFFER_SIZE); + const size_t bytes_read = static_cast(file.gcount()); + file.close(); + + if (bytes_read == 0) + return false; + + // Step 1: Check for known magic number signatures + for (const auto& sig : impl::signatures) + { + if (bytes_read >= sig.offset + sig.length) + { + if (std::memcmp(buffer.data() + sig.offset, sig.bytes, sig.length) == 0) + { + detected_type = sig.type; + + // Special handling for RIFF-based formats + if (sig.bytes == impl::sig_webp || sig.bytes == impl::sig_wav || + sig.bytes == impl::sig_avi) + { + const auto riff_type = impl::check_riff_type(buffer.data(), bytes_read); + if (riff_type != file_content_type::UNKNOWN) + detected_type = riff_type; + } + + // Special handling for ZIP (could be Office document) + if (detected_type == file_content_type::COMPRESSED && + sig.bytes == impl::sig_zip) + { + detected_type = impl::check_office_type(filename); + } + + // Binary types + return false; + } + } + } + + // Step 2: Check for XML/HTML content + if (impl::is_xml_content(buffer.data(), bytes_read)) + { + detected_type = file_content_type::TEXT_XML; + return true; + } + + // Step 3: Calculate entropy to distinguish text from binary + const double entropy = impl::calculate_entropy(buffer.data(), bytes_read); + + // Step 4: Use heuristics to classify content + // Entropy thresholds: + // < 5.0 : Likely plain text + // 5.0-6.8: Could be text or structured binary + // > 6.8 : Likely compressed/encrypted/random binary + + const bool is_text = impl::is_text_content(buffer.data(), bytes_read); + + if (is_text && entropy < 6.5) + { + // High probability of plain text (< 5.5) + // Or could be text with some binary content (e.g., source code with special chars) + detected_type = file_content_type::TEXT_PLAIN; + return true; + } + + // Likely binary content (no recognized format) + detected_type = file_content_type::UNKNOWN; + return false; + } + 
+ // --------------------------------------------------------------------------------- + + // Compute Levenshtein (edit) distance between two token sequences + inline size_t edit_distance(const std::vector& tokens1, const std::vector& tokens2) + { + const size_t len1 = tokens1.size(); + const size_t len2 = tokens2.size(); + + if (len1 == 0) return len2; + if (len2 == 0) return len1; + + // DP table: dp[i][j] = edit distance between tokens1[0..i-1] and tokens2[0..j-1] + std::vector> dp(len1 + 1, std::vector(len2 + 1)); + + // Initialize base cases + for (size_t i = 0; i <= len1; ++i) + dp[i][0] = i; + for (size_t j = 0; j <= len2; ++j) + dp[0][j] = j; + + // Fill DP table + for (size_t i = 1; i <= len1; ++i) { + for (size_t j = 1; j <= len2; ++j) { + if (tokens1[i - 1] == tokens2[j - 1]) { + dp[i][j] = dp[i - 1][j - 1]; // No edit needed + } + else { + dp[i][j] = 1 + std::min({ dp[i - 1][j], // Deletion + dp[i][j - 1], // Insertion + dp[i - 1][j - 1] // Substitution + }); + } + } + } + + return dp[len1][len2]; + } + + // Compute normalized edit distance as a similarity score between 0 and 1 + inline double normalized_edit_similarity(const std::vector& tokens1, const std::vector& tokens2) + { + if (tokens1.empty() && tokens2.empty()) + return 1.0; + + const size_t max_len = std::max(tokens1.size(), tokens2.size()); + if (max_len == 0) + return 1.0; + + const size_t dist = edit_distance(tokens1, tokens2); + return 1.0 - (static_cast(dist) / max_len); + } + + // Compute token-level precision, recall, and F1-score + struct token_overlap_metrics + { + double precision; // What fraction of generated tokens appear in reference + double recall; // What fraction of reference tokens appear in generated + double f1_score; // Harmonic mean of precision and recall + + void print() const + { + std::cout << "Token overlap metrics:\n" + << " Precision: " << std::fixed << std::setprecision(4) << (precision * 100.0) << "%\n" + << " Recall: " << std::fixed << std::setprecision(4) << 
(recall * 100.0) << "%\n" + << " F1-score: " << std::fixed << std::setprecision(4) << (f1_score * 100.0) << "%\n"; + } + }; + + inline token_overlap_metrics compute_token_overlap( + const std::vector& reference, + const std::vector& generated) + { + token_overlap_metrics metrics{ 0.0, 0.0, 0.0 }; + + if (reference.empty() || generated.empty()) + return metrics; + + // Count matching tokens + std::multiset ref_tokens(reference.begin(), reference.end()); + std::multiset gen_tokens(generated.begin(), generated.end()); + + size_t matches = 0; + for (int token : gen_tokens) { + auto it = ref_tokens.find(token); + if (it != ref_tokens.end()) { + ++matches; + ref_tokens.erase(it); // Remove to handle duplicates correctly + } + } + + // Calculate precision and recall + metrics.precision = static_cast(matches) / generated.size(); + metrics.recall = static_cast(matches) / reference.size(); + + // Calculate F1-score + if (metrics.precision + metrics.recall > 0.0) { + metrics.f1_score = 2.0 * (metrics.precision * metrics.recall) / + (metrics.precision + metrics.recall); + } + + return metrics; + } + + // Compute BLEU-like n-gram overlap score + inline double compute_ngram_overlap( + const std::vector& reference, + const std::vector& generated, + int max_n = 4) + { + if (reference.empty() || generated.empty()) + return 0.0; + + double total_score = 0.0; + int valid_n_count = 0; + + // Compute overlap for n-grams of size 1 to max_n + for (int n = 1; n <= max_n; ++n) { + if (static_cast(n) > reference.size() || + static_cast(n) > generated.size()) + break; + + // Extract n-grams from reference + std::map, size_t> ref_ngrams; + for (size_t i = 0; i <= reference.size() - n; ++i) { + std::vector ngram(reference.begin() + i, reference.begin() + i + n); + ref_ngrams[ngram]++; + } + + // Count matching n-grams in generated + size_t matches = 0; + size_t total_gen_ngrams = 0; + for (size_t i = 0; i <= generated.size() - n; ++i) { + std::vector ngram(generated.begin() + i, 
generated.begin() + i + n); + total_gen_ngrams++; + + auto it = ref_ngrams.find(ngram); + if (it != ref_ngrams.end() && it->second > 0) { + matches++; + it->second--; // Decrement to handle multiple occurrences + } + } + + if (total_gen_ngrams > 0) { + total_score += static_cast(matches) / total_gen_ngrams; + valid_n_count++; + } + } + + // Return average n-gram precision + return valid_n_count > 0 ? total_score / valid_n_count : 0.0; + } + + // Text similarity report + struct text_similarity_report + { + double edit_similarity; // Normalized Levenshtein distance + token_overlap_metrics overlap; // Token-level precision/recall/F1 + double ngram_score; // N-gram overlap (BLEU-like) + + void print() const + { + std::cout << "\n=== Text similarity report ===\n"; + std::cout << "Edit similarity (order-sensitive): " + << std::fixed << std::setprecision(4) << (edit_similarity * 100.0) << "%\n\n"; + + overlap.print(); + + std::cout << "\nN-gram overlap (BLEU-like): " + << std::fixed << std::setprecision(4) << (ngram_score * 100.0) << "%\n"; + std::cout << "==============================\n\n"; + } + }; + + inline text_similarity_report compute_text_similarity( + const std::vector& reference, + const std::vector& generated) + { + text_similarity_report report; + + report.edit_similarity = normalized_edit_similarity(reference, generated); + report.overlap = compute_token_overlap(reference, generated); + report.ngram_score = compute_ngram_overlap(reference, generated, 4); + + return report; + } + + class inference_context + { + public: + inference_context( + long window_size = 256, + long context_multiplier = 10, + long padding_token = 0 + ) : window_size_(window_size), + context_capacity_(window_size * context_multiplier), + padding_token_(padding_token), + current_size_(0) + { + DLIB_CASSERT(window_size > 0, "Window size must be positive"); + DLIB_CASSERT(context_multiplier > 0, "Context multiplier must be positive"); + context_.reserve(context_capacity_); + } + + void 
add_token(unsigned long token) + { + if (current_size_ == context_capacity_) + { + // FIFO: remove oldest, add newest + context_.erase(context_.begin()); + context_.push_back(static_cast(token)); + } + else + { + // Still room in context + context_.push_back(static_cast(token)); + current_size_++; + } + } + + void add_tokens(const std::vector& tokens) + { + for (unsigned long token : tokens) add_token(token); + } + + void add_tokens(const std::vector& tokens) + { + for (int token : tokens) add_token(static_cast(token)); + } + + matrix get_input_window(long custom_window_size = -1) const + { + long win_size = (custom_window_size > 0) ? custom_window_size : window_size_; + matrix window(win_size, 1); + + if (current_size_ >= win_size) + { + // Context has enough tokens - take last win_size tokens + for (long i = 0; i < win_size; ++i) + window(i) = context_[current_size_ - win_size + i]; + } + else + { + // Context has fewer tokens - left pad + long padding_needed = win_size - current_size_; + + for (long i = 0; i < padding_needed; ++i) + window(i) = padding_token_; + for (long i = 0; i < current_size_; ++i) + window(padding_needed + i) = context_[i]; + } + + return window; + } + + void reset() + { + context_.clear(); + current_size_ = 0; + } + + void resize_context(long new_capacity) + { + DLIB_CASSERT(new_capacity > 0, "New capacity must be positive"); + + if (new_capacity < current_size_) + { + // Keep only the last new_capacity tokens + context_.erase(context_.begin(), context_.begin() + (current_size_ - new_capacity)); + current_size_ = new_capacity; + } + + context_capacity_ = new_capacity; + context_.reserve(context_capacity_); + } + + long size() const { return current_size_; } + long capacity() const { return context_capacity_; } + long window_size() const { return window_size_; } + bool is_full() const { return current_size_ >= context_capacity_; } + const std::vector& get_full_context() const { return context_; } + + std::string to_string(bool show_all = 
false) const + { + std::ostringstream ss; + ss << "InferenceContext[size=" << current_size_ + << "/" << context_capacity_ + << ", window=" << window_size_ << "]\n"; + + if (show_all && current_size_ > 0) + { + ss << "Tokens: ["; + long display_count = show_all ? current_size_ : std::min(20L, current_size_); + for (long i = 0; i < display_count; ++i) + { + ss << context_[i]; + if (i < display_count - 1) ss << ", "; + } + if (current_size_ > display_count) + { + ss << " ... +" << (current_size_ - display_count) << " more"; + } + ss << "]"; + } + + return ss.str(); + } + + friend void serialize(const inference_context& item, std::ostream& out) + { + serialize("inference_context", out); + serialize(item.window_size_, out); + serialize(item.context_capacity_, out); + serialize(item.padding_token_, out); + serialize(item.current_size_, out); + serialize(item.context_, out); + } + + friend void deserialize(inference_context& item, std::istream& in) + { + std::string name; + deserialize(name, in); + if (name != "inference_context") + { + throw serialization_error("Error deserializing object of type 'inference_context': " + "expected 'inference_context' but got '" + name + "'"); + } + + deserialize(item.window_size_, in); + deserialize(item.context_capacity_, in); + deserialize(item.padding_token_, in); + deserialize(item.current_size_, in); + deserialize(item.context_, in); + } + + private: + std::vector context_; // Full context history + long window_size_; // Window size for model input + long context_capacity_; // Maximum context size + long padding_token_; // Token used for left padding + long current_size_; // Current number of tokens + }; + + inline void build_single_token_prediction_dataset( + const std::vector>& token_sequences, + long window_len, + long padding_token, + bool use_left_padding, + std::vector>& X, + std::vector& Y) + { + X.clear(); + Y.clear(); + + for (const auto& seq : token_sequences) + { + const long len = static_cast(seq.size()); + if (len <= 1) 
continue; + + long start = 0; + if (len < window_len) + { + if (!use_left_padding) continue; + start = (len - window_len); + } + + // Generate initial padded samples for sequences >= window_len + if (use_left_padding && len >= window_len) + { + for (long pos = 1; pos < window_len; ++pos) + { + matrix window(window_len, 1); + long pad = window_len - pos; + + for (long i = 0; i < pad; ++i) window(i) = padding_token; + for (long i = 0; i < pos; ++i) window(pad + i) = seq[i]; + + X.push_back(window); + Y.push_back(seq[pos]); + } + } + + // Slide window through sequence + for (long pos = start; pos < len - 1; ++pos) + { + matrix window(window_len, 1); + + for (long i = 0; i < window_len; ++i) + { + long idx = pos + i; + window(i) = (idx >= 0 && idx < len) ? seq[idx] : padding_token; + } + + long target_idx = pos + window_len; + if (target_idx >= 0 && target_idx < len) + { + X.push_back(window); + Y.push_back(seq[target_idx]); + } + } + } + } + + inline void build_multi_token_prediction_dataset( + const std::vector>& source_sequences, + const std::vector>& target_sequences, + long src_window_len, + long tgt_window_len, + long padding_token, + std::vector>& X, + std::vector>& Y) + { + DLIB_CASSERT(source_sequences.size() == target_sequences.size(), + "Source and target must have same size"); + + X.clear(); + Y.clear(); + + for (size_t i = 0; i < source_sequences.size(); ++i) + { + const auto& src = source_sequences[i]; + const auto& tgt = target_sequences[i]; + + const long src_len = static_cast(src.size()); + const long tgt_len = static_cast(tgt.size()); + + if (src_len == 0 || tgt_len == 0) continue; + + long src_pos = (src_len < src_window_len) ? 
(src_len - src_window_len) : 0; + long tgt_pos = 0; + + while (true) + { + // Build source window + matrix src_window(src_window_len, 1); + long src_real = 0; + + for (long j = 0; j < src_window_len; ++j) + { + long idx = src_pos + j; + if (idx >= 0 && idx < src_len) + { + src_window(j) = src[idx]; + src_real++; + } + else + { + src_window(j) = padding_token; + } + } + + // Build target window + matrix tgt_window(tgt_window_len, 1); + long tgt_real = 0; + + for (long j = 0; j < tgt_window_len; ++j) + { + long idx = tgt_pos + j; + if (idx < tgt_len) + { + tgt_window(j) = tgt[idx]; + tgt_real++; + } + else + { + tgt_window(j) = padding_token; + } + } + + // Stop if no real tokens in either window + if (src_real == 0 || tgt_real == 0) break; + + X.push_back(src_window); + Y.push_back(tgt_window); + + // Stop if both sequences fully consumed + if (src_pos + src_window_len >= src_len && + tgt_pos + tgt_window_len >= tgt_len) break; + + src_pos++; + tgt_pos++; + } + } + } + + template + void shuffle_training_dataset( + std::vector& samples, + std::vector& labels, + unsigned long seed = 0) + { + DLIB_CASSERT(samples.size() == labels.size(), + "samples and labels must have the same size"); + + const size_t dataset_size = samples.size(); + if (dataset_size <= 1) return; + + dlib::rand rng; + if (seed != 0) rng = dlib::rand(seed); + + // Fisher-Yates shuffle algorithm + for (size_t i = dataset_size - 1; i > 0; --i) + { + size_t j = rng.get_random_32bit_number() % (i + 1); + + // Swap samples[i] with samples[j] + std::swap(samples[i], samples[j]); + + // Swap labels[i] with labels[j] + std::swap(labels[i], labels[j]); + } + } + + template + void augment_training_dataset( + std::vector& samples, + std::vector& labels, + int unk_token, + int padding_token, + double augmentation_ratio = 0.2, + long min_noise_tokens = 1, + long max_noise_tokens = 3, + unsigned long seed = 0) + { + DLIB_CASSERT(samples.size() == labels.size(), + "samples and labels must have the same size"); + 
DLIB_CASSERT(augmentation_ratio >= 0.0 && augmentation_ratio <= 2.0, + "augmentation_ratio must be between 0.0 and 2.0"); + DLIB_CASSERT(min_noise_tokens >= 0 && max_noise_tokens >= min_noise_tokens, + "Invalid noise token range: min=" << min_noise_tokens << ", max=" << max_noise_tokens); + + const size_t original_size = samples.size(); + if (original_size == 0 || augmentation_ratio == 0.0) return; + + // Calculate number of augmented samples to create + const size_t num_augmented = static_cast(original_size * augmentation_ratio); + if (num_augmented == 0) return; + + // Reserve space to avoid multiple reallocations + samples.reserve(original_size + num_augmented); + labels.reserve(original_size + num_augmented); + + dlib::rand rng; + if (seed != 0) rng = dlib::rand(seed); + + for (size_t aug_idx = 0; aug_idx < num_augmented; ++aug_idx) + { + // Select a random sample to augment + const size_t source_idx = rng.get_random_32bit_number() % original_size; + + // Create a copy of the sample and its label + auto augmented_sample = samples[source_idx]; + auto augmented_label = labels[source_idx]; + + // Identify non-padding positions in the sample + std::vector valid_positions; + const long sample_length = augmented_sample.nr(); + + for (long i = 0; i < sample_length; ++i) + { + if (augmented_sample(i) != padding_token) + valid_positions.push_back(i); + } + + // Skip if no valid positions to add noise + if (valid_positions.empty()) continue; + + // Determine number of tokens to replace with noise + const long num_valid = static_cast(valid_positions.size()); + const long effective_max = std::min(max_noise_tokens, num_valid); + const long effective_min = std::min(min_noise_tokens, effective_max); + + long num_noise = effective_min; + if (effective_max > effective_min) + { + num_noise = effective_min + + (rng.get_random_32bit_number() % (effective_max - effective_min + 1)); + } + + // Ensure noise ratio is reasonable (max 30% of non-padding tokens) + const long 
max_reasonable = std::max(1L, static_cast(num_valid * 0.3)); + num_noise = std::min(num_noise, max_reasonable); + + // Randomly select positions to replace with UNK + std::vector noise_positions = valid_positions; + + // Fisher-Yates shuffle to select random positions + for (long i = static_cast(noise_positions.size()) - 1; i > 0; --i) + { + long j = rng.get_random_32bit_number() % (i + 1); + std::swap(noise_positions[i], noise_positions[j]); + } + + // Apply noise to the first num_noise positions + for (long i = 0; i < num_noise; ++i) + { + augmented_sample(noise_positions[i]) = unk_token; + } + + // Add augmented sample and label to the dataset + samples.push_back(std::move(augmented_sample)); + labels.push_back(std::move(augmented_label)); + } + } + +} // namespace dlib + +#endif // DLIB_LANGUAGE_MODEL_DATA_H_ \ No newline at end of file diff --git a/dlib/data_io/language_model_data_abstract.h b/dlib/data_io/language_model_data_abstract.h new file mode 100644 index 0000000000..2b797223e2 --- /dev/null +++ b/dlib/data_io/language_model_data_abstract.h @@ -0,0 +1,556 @@ +// Copyright (C) 2025 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_LANGUAGE_MODEL_DATA_ABSTRACT_H_ +#ifdef DLIB_LANGUAGE_MODEL_DATA_ABSTRACT_H_ + +#include +#include +#include +#include "../matrix.h" +#include "../serialize.h" + +namespace dlib +{ + // --------------------------------------------------------------------------------- + + enum class file_content_type + { + /*! + WHAT THIS ENUM REPRESENTS + Enumeration of recognized file content types for classification purposes. + Used by detect_file_type() to identify the nature of file contents. + + VALUES + TEXT_PLAIN - Plain text files (including CSV, source code, logs, etc.) + TEXT_XML - XML or HTML markup documents + IMAGE - Image formats (PNG, JPEG, GIF, TIFF, BMP, WEBP, etc.) + VIDEO - Video formats (MP4, AVI, MKV, etc.) 
+ AUDIO - Audio formats (MP3, WAV, FLAC, OGG, etc.) + EXECUTABLE - Executable binary files (EXE, DLL, ELF, Mach-O) + COMPRESSED - Compressed archives (ZIP, GZIP, 7Z, RAR, etc.) + PDF - PDF documents + OFFICE - Office documents (DOCX, XLSX, PPTX) + UNKNOWN - File type could not be determined or is not recognized + + NOTES + - Detection is based on file content analysis, not file extensions + - Magic number signatures are checked first for binary formats + - Entropy analysis and heuristics are used for text vs binary classification + !*/ + }; + + // --------------------------------------------------------------------------------- + + inline bool detect_file_type( + const std::string& filename, + file_content_type& detected_type + ); + /*! + ensures + - Efficiently detects the content type of a file by analyzing its internal + structure using magic number signatures and entropy-based heuristics + - Opens and reads the first 8KB of the file for analysis + - Returns true if file contains text-based content (TEXT_PLAIN or TEXT_XML) + - Returns false if file contains binary content or cannot be opened + - Sets detected_type to the most specific content type that could be identified + - If file cannot be opened, returns false and sets detected_type to UNKNOWN + + FILE DETECTION METHODOLOGY + The function uses a multi-stage detection process: + + Stage 1: magic number detection (Binary Formats) + - Checks for ~30 common file format signatures (magic numbers) + - Supported formats include: + * Images: PNG, JPEG (4 variants), GIF (87a/89a), TIFF (LE/BE), BMP, WEBP + * Documents: PDF + * Compressed: ZIP, GZIP, 7Z, RAR + * Executables: Windows PE (EXE/DLL), Unix ELF, macOS Mach-O (32/64-bit) + * Audio: MP3 (ID3/FF), WAV, FLAC, OGG + * Video: MP4, AVI, MKV + - Special handling for container formats: + * RIFF containers (WAV/AVI/WEBP) are distinguished by format identifier + * ZIP files are checked against filename to detect Office documents (DOCX/XLSX/PPTX) + - If magic number 
is found, returns false (binary) with appropriate type + + Stage 2: XML/HTML detection + - Checks for XML declarations (90% printable characters + * <10% control characters + * Entropy < 5.5 (high confidence text) + * Entropy < 6.5 (text with special characters) + * Entropy >= 6.8 (likely binary/compressed/encrypted) + + TYPICAL USAGE + file_content_type type; + + // Detect file type + bool is_text = detect_file_type("document.pdf", type); + + if (type == file_content_type::PDF) + std::cout << "PDF document detected\n"; + else if (type == file_content_type::IMAGE) + std::cout << "Image file detected\n"; + else if (is_text) + std::cout << "Text file detected\n"; + else + std::cout << "Binary file or unknown format\n"; + + // Filter text files for processing + std::vector filenames = get_file_list(); + for (const auto& fname : filenames) + { + file_content_type ftype; + if (detect_file_type(fname, ftype)) + { + // Process text file + process_text_file(fname); + } + } + !*/ + + // --------------------------------------------------------------------------------- + + inline size_t edit_distance( + const std::vector& tokens1, + const std::vector& tokens2 + ); + /*! + ensures + - Computes the Levenshtein (edit) distance between two token sequences + - Returns the minimum number of single-token edits (insertions, deletions, + or substitutions) required to transform tokens1 into tokens2 + - Uses dynamic programming with O(n*m) time complexity and O(n*m) space + - Returns tokens2.size() if tokens1 is empty + - Returns tokens1.size() if tokens2 is empty + - Returns 0 if both sequences are identical + !*/ + + inline double normalized_edit_similarity( + const std::vector& tokens1, + const std::vector& tokens2 + ); + /*! 
+ ensures + - Computes a normalized similarity score based on edit distance + - Returns a value in the range [0.0, 1.0] where: + * 1.0 indicates identical sequences + * 0.0 indicates completely different sequences + - Formula: 1.0 - (edit_distance / max_length) + - If both sequences are empty, returns 1.0 (considered identical) + - This metric is order-sensitive: [1,2,3] vs [3,2,1] will have low similarity + !*/ + + // --------------------------------------------------------------------------------- + + struct token_overlap_metrics + { + /*! + WHAT THIS OBJECT REPRESENTS + Stores token-level evaluation metrics that treat sequences as + bags of tokens (order-independent). Useful for assessing vocabulary + overlap between reference and generated text. + + FIELDS + precision - Fraction of generated tokens that appear in the reference + Range: [0.0, 1.0] + Formula: matching_tokens / total_generated_tokens + + recall - Fraction of reference tokens that appear in the generated text + Range: [0.0, 1.0] + Formula: matching_tokens / total_reference_tokens + + f1_score - Harmonic mean of precision and recall + Range: [0.0, 1.0] + Formula: 2 * (precision * recall) / (precision + recall) + + INTERPRETATION + - High precision: generated text uses vocabulary from reference + - High recall: generated text covers reference vocabulary + - High F1: good balance between precision and recall + - Unlike edit distance, this metric ignores token order + !*/ + + double precision; + double recall; + double f1_score; + + void print() const; + /*! + ensures + - Prints formatted metrics to standard output + - Format: "Precision: XX.XX%\n Recall: XX.XX%\n F1-score: XX.XX%" + !*/ + }; + + inline token_overlap_metrics compute_token_overlap( + const std::vector& reference, + const std::vector& generated + ); + /*! 
+ ensures + - Computes token-level precision, recall, and F1-score between reference + and generated token sequences + - Treats sequences as multisets (bags) of tokens, ignoring order + - Handles duplicate tokens correctly by matching each token at most once + - Returns metrics with all values set to 0.0 if either sequence is empty + - Precision = fraction of generated tokens found in reference + - Recall = fraction of reference tokens found in generated + - F1 = harmonic mean of precision and recall + !*/ + + // --------------------------------------------------------------------------------- + + inline double compute_ngram_overlap( + const std::vector& reference, + const std::vector& generated, + int max_n = 4 + ); + /*! + requires + - max_n >= 1 + ensures + - Computes n-gram overlap score similar to BLEU metric + - Evaluates matching n-grams for n = 1, 2, 3, ..., max_n + - Returns average n-gram precision across all n values + - Score range: [0.0, 1.0] where 1.0 is perfect overlap + - Returns 0.0 if either sequence is empty + - Stops computing for n-values where n > sequence length + + COMPARISON TO BLEU + - Similar to BLEU but simplified (no brevity penalty, no geometric mean) + - Uses arithmetic mean instead of geometric mean for simplicity + - Suitable for quick similarity assessment in language model evaluation + !*/ + + // --------------------------------------------------------------------------------- + + struct text_similarity_report + { + /*! + WHAT THIS OBJECT REPRESENTS + Comprehensive similarity report combining multiple metrics to evaluate + how closely generated text matches reference text. Provides both + order-sensitive and order-insensitive measures. 
+ + FIELDS + edit_similarity - Normalized Levenshtein distance (order-sensitive) + Range: [0.0, 1.0] + Measures token-by-token match considering order + + overlap - Token-level precision/recall/F1 metrics + Order-insensitive bag-of-tokens comparison + Useful for vocabulary coverage assessment + + ngram_score - BLEU-like n-gram overlap score (order-aware locally) + Range: [0.0, 1.0] + Captures phrase-level similarity + + INTERPRETATION GUIDE + Use edit_similarity when: + - Exact token order matters + - Evaluating sequence prediction tasks + - Need strict alignment measure + + Use overlap metrics when: + - Vocabulary coverage is important + - Order is less critical + - Want to know what fraction of tokens are correct + + Use ngram_score when: + - Local phrase structure matters + - Evaluating fluency and coherence + - Need metric between strict order and pure bag-of-words + !*/ + + double edit_similarity; + token_overlap_metrics overlap; + double ngram_score; + + void print() const; + /*! + ensures + - Prints comprehensive formatted report to standard output + - Displays all three metric categories with clear labels + - Format optimized for readability with percentages and section headers + !*/ + }; + + inline text_similarity_report compute_text_similarity( + const std::vector& reference, + const std::vector& generated + ); + /*! + ensures + - Computes comprehensive similarity metrics between reference and generated + token sequences + - Returns text_similarity_report containing: + * edit_similarity: normalized Levenshtein distance + * overlap: token-level precision/recall/F1 scores + * ngram_score: BLEU-like n-gram overlap (up to 4-grams) + - This is the primary function for evaluating text generation quality + - Provides multiple complementary views of similarity + !*/ + + // --------------------------------------------------------------------------------- + + class inference_context + { + /*! 
+ WHAT THIS OBJECT REPRESENTS + This class manages a token context for inference with language models. + It maintains a full history context and provides a sliding window view + for model input. + + Features: + - Full context history with configurable capacity + - Sliding window extraction for model input + - Left padding when context not full + - FIFO policy when context reaches capacity + - Dynamic resizing without data loss + + TYPICAL USAGE + inference_context ctx(256, 10, 0); // window=256, capacity=2560, pad=0 + + ctx.add_tokens({1, 2, 3, 4, 5}); // Add tokens + auto input = ctx.get_input_window(); // Get last 256 tokens (padded if needed) + + // Feed to model, get prediction, add to context + unsigned long next_token = model(input); + ctx.add_token(next_token); + !*/ + public: + inference_context( + long window_size = 256, + long context_multiplier = 10, + long padding_token = 0 + ); + /*! + requires + - window_size > 0 + - context_multiplier > 0 + ensures + - Constructs an inference context manager + - context_capacity = window_size * context_multiplier + - Context is initially empty (will be left-padded) + !*/ + + void add_token(unsigned long token); + /*! + ensures + - Adds a single token to the context + - If context is full, removes oldest token (FIFO) + - New token is always added at the end + !*/ + + void add_tokens(const std::vector& tokens); + void add_tokens(const std::vector& tokens); + /*! + ensures + - Adds multiple tokens to the context + - Tokens are added in order + - FIFO policy applies if capacity exceeded + !*/ + + matrix get_input_window(long custom_window_size = -1) const; + /*! + ensures + - Returns a window of tokens suitable for model input + - Window size is custom_window_size if specified, otherwise window_size_ + - Window contains the last N tokens from context + - Left-padded with padding_token if context has fewer than N tokens + - Returns matrix of shape (N, 1) compatible with Dlib + !*/ + + void reset(); + /*! 
+ ensures + - Clears all tokens from context + - Resets current_size to 0 + - Context capacity remains unchanged + !*/ + + void resize_context(long new_capacity); + /*! + requires + - new_capacity > 0 + ensures + - Resizes the context capacity + - Preserves existing tokens (up to new capacity) + - If new_capacity < current_size, keeps only the last new_capacity tokens + !*/ + + long size() const; + /*! + ensures + - Returns the current number of tokens in context + !*/ + + long capacity() const; + /*! + ensures + - Returns the maximum capacity of the context + !*/ + + long window_size() const; + /*! + ensures + - Returns the default window size for model input + !*/ + + bool is_full() const; + /*! + ensures + - Returns true if context is at full capacity + !*/ + + const std::vector& get_full_context() const; + /*! + ensures + - Returns a const reference to the full context vector + !*/ + + std::string to_string(bool show_all = false) const; + /*! + ensures + - Returns a string representation of the context for debugging + !*/ + + friend void serialize(const inference_context& item, std::ostream& out); + /*! + ensures + - Serializes the inference_context to an output stream + - Saves all context data and configuration parameters + !*/ + + friend void deserialize(inference_context& item, std::istream& in); + /*! + ensures + - Deserializes the inference_context from an input stream + - Restores all context data and configuration parameters + !*/ + + private: + std::vector context_; // Full context history + long context_capacity_; // Maximum context size + long window_size_; // Window size for model input + long padding_token_; // Token used for left padding + long current_size_; // Current number of tokens + }; + + inline void build_single_token_prediction_dataset( + const std::vector>& token_sequences, + long window_len, + long padding_token, + bool use_left_padding, + std::vector>& X, + std::vector& Y); + /*! 
+ ensures + - Constructs training samples for single next-token prediction using a sliding window approach + - For each sequence, creates input windows of size window_len paired with the immediately following token + - If use_left_padding is true: + * Sequences shorter than window_len are left-padded with padding_token + * Sequences >= window_len generate initial samples with progressive left padding + - If use_left_padding is false: + * Sequences shorter than window_len are skipped + - Returns samples in X (input windows) and Y (target tokens) + - X contains matrix of shape (window_len, 1) + - Y contains unsigned long values representing the next token + !*/ + + inline void build_multi_token_prediction_dataset( + const std::vector>& source_sequences, + const std::vector>& target_sequences, + long src_window_len, + long tgt_window_len, + long padding_token, + std::vector>& X, + std::vector>& Y); + /*! + requires + - source_sequences.size() == target_sequences.size() + - src_window_len > 0 + - tgt_window_len > 0 + ensures + - Constructs training samples for sequence-to-sequence prediction + - For each (source, target) pair, creates aligned windows that slide synchronously + - Source windows are left-padded with padding_token when source length < src_window_len + - Target windows are right-padded with padding_token when insufficient tokens remain + - Sliding continues while both windows contain at least one real (non-padding) token + - Stops when both sequences are fully consumed (all tokens have appeared in windows) + - Returns samples in X (source windows) and Y (target windows) + - X contains matrix of shape (src_window_len, 1) + - Y contains matrix of shape (tgt_window_len, 1) + !*/ + + template + void shuffle_training_dataset( + std::vector& samples, + std::vector& labels, + unsigned long seed = 0 + ); + /*! 
+ requires + - samples.size() == labels.size() + ensures + - Randomly shuffles the training dataset in-place + - Applies the same permutation to both samples and labels to maintain correspondence + - If seed == 0, uses a random seed based on current time + - If seed != 0, uses the provided seed for reproducible shuffling + - After shuffling, samples[i] still corresponds to labels[i] + - Uses Fisher-Yates shuffle algorithm for uniform random permutation + !*/ + + template + void augment_training_dataset( + std::vector& samples, + std::vector& labels, + int unk_token, + int padding_token, + double augmentation_ratio = 0.2, + long min_noise_tokens = 1, + long max_noise_tokens = 3, + unsigned long seed = 0 + ); + /*! + requires + - samples.size() == labels.size() + - 0.0 <= augmentation_ratio <= 2.0 + - min_noise_tokens >= 0 + - max_noise_tokens >= min_noise_tokens + ensures + - Augments the training dataset by adding noisy copies of existing samples + - Creates floor(samples.size() * augmentation_ratio) new augmented samples + - For each augmented sample: + * Randomly selects a source sample from the original dataset + * Creates a copy of the sample and its corresponding label + * Randomly replaces between min_noise_tokens and max_noise_tokens + non-padding tokens with unk_token + * Only tokens != padding_token are eligible for noise injection + * Number of noise tokens is capped at 30% of non-padding tokens + to maintain sample quality + - Corresponding labels are appended to labels vector (unchanged) + - Original samples and labels are preserved + - If seed == 0, uses random seed based on current time + - If seed != 0, uses provided seed for reproducible augmentation + - Default augmentation_ratio of 0.2 (20%) follows common practices + in language model training literature + !*/ + +} // namespace dlib + +#endif // DLIB_LANGUAGE_MODEL_DATA_ABSTRACT_H_ \ No newline at end of file diff --git a/dlib/dnn.h b/dlib/dnn.h index bc38dc4b73..313c19b6f7 100644 --- 
a/dlib/dnn.h +++ b/dlib/dnn.h @@ -32,6 +32,7 @@ #include "dnn/utilities.h" #include "dnn/validation.h" #include "dnn/visitors.h" +#include "dnn/transformer.h" #endif // DLIB_DNn_ diff --git a/dlib/dnn/core.h b/dlib/dnn/core.h index dd4f7eb0fb..0b71712e69 100644 --- a/dlib/dnn/core.h +++ b/dlib/dnn/core.h @@ -1,3896 +1,3942 @@ -// Copyright (C) 2015 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#ifndef DLIB_DNn_CORE_H_ -#define DLIB_DNn_CORE_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "core_abstract.h" -#include "../cuda/tensor.h" -#include "../cuda/tensor_tools.h" -#include "../statistics.h" -#include "../rand.h" -#include "../algs.h" -#include "../metaprogramming.h" -#include "../utility.h" -#include "../constexpr_if.h" - -#ifdef _MSC_VER -// Tell Visual Studio not to recursively inline functions very much because otherwise it -// takes hours to compile the DNN code sometimes. It's crazy. Hopefully we can remove -// this some day when the visual studio compiler is more efficient. 
-#pragma inline_depth(2) -#endif - - -namespace dlib -{ - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - template - using has_get_learning_rate_multiplier = decltype(std::declval().get_learning_rate_multiplier()); - - template - using has_set_learning_rate_multiplier = decltype(std::declval().set_learning_rate_multiplier(double{})); - - template - using has_get_bias_learning_rate_multiplier = decltype(std::declval().get_bias_learning_rate_multiplier()); - - template - using has_set_bias_learning_rate_multiplier = decltype(std::declval().set_bias_learning_rate_multiplier(double{})); - - template - using has_get_weight_decay_multiplier = decltype(std::declval().get_weight_decay_multiplier()); - - template - using has_set_weight_decay_multiplier = decltype(std::declval().set_weight_decay_multiplier(double{})); - - template - using has_get_bias_weight_decay_multiplier = decltype(std::declval().get_bias_weight_decay_multiplier()); - - template - using has_set_bias_weight_decay_multiplier = decltype(std::declval().set_bias_weight_decay_multiplier(double{})); - - template - using has_disable_bias = decltype(std::declval().disable_bias()); - - template - using has_clean = decltype(std::declval().clean()); - } - -// ---------------------------------------------------------------------------------------- - - template - double get_learning_rate_multiplier(const T& obj) - { - return switch_(bools(is_detected{}), - [&](true_t, auto _) { return _(obj).get_learning_rate_multiplier(); }, - [](auto...) { return 1.0; } - ); - } - - template - void set_learning_rate_multiplier( - T& obj, - double learning_rate_multiplier - ) - { - DLIB_CASSERT(learning_rate_multiplier >= 0); - switch_(bools(is_detected{}), - [&](true_t, auto _) { _(obj).set_learning_rate_multiplier(learning_rate_multiplier); }, - [](auto...) 
{/*no-op*/} - ); - } - -// ---------------------------------------------------------------------------------------- - - template - double get_bias_learning_rate_multiplier(const T& obj) - { - return switch_(bools(is_detected{}), - [&](true_t, auto _) { return _(obj).get_bias_learning_rate_multiplier(); }, - [](auto...) { return 1.0; } - ); - } - - template - void set_bias_learning_rate_multiplier( - T& obj, - double bias_learning_rate_multiplier - ) - { - DLIB_CASSERT(bias_learning_rate_multiplier >= 0); - switch_(bools(is_detected{}), - [&](true_t, auto _) { _(obj).set_bias_learning_rate_multiplier(bias_learning_rate_multiplier); }, - [](auto...) {/*no-op*/} - ); - } - -// ---------------------------------------------------------------------------------------- - - template - double get_weight_decay_multiplier(const T& obj) - { - return switch_(bools(is_detected{}), - [&](true_t, auto _) { return _(obj).get_weight_decay_multiplier(); }, - [](auto...) { return 1.0; } - ); - } - - template - void set_weight_decay_multiplier( - T& obj, - double weight_decay_multiplier - ) - { - DLIB_CASSERT(weight_decay_multiplier >= 0); - switch_(bools(is_detected{}), - [&](true_t, auto _) { _(obj).set_weight_decay_multiplier(weight_decay_multiplier); }, - [](auto...) {/*no-op*/} - ); - } - -// ---------------------------------------------------------------------------------------- - - template - double get_bias_weight_decay_multiplier(const T& obj) - { - return switch_(bools(is_detected{}), - [&](true_t, auto _) { return _(obj).get_bias_weight_decay_multiplier(); }, - [](auto...) { return 1.0; } - ); - } - - template - void set_bias_weight_decay_multiplier( - T& obj, - double bias_weight_decay_multiplier - ) - { - DLIB_CASSERT(bias_weight_decay_multiplier >= 0); - switch_(bools(is_detected{}), - [&](true_t, auto _) { _(obj).set_bias_weight_decay_multiplier(bias_weight_decay_multiplier); }, - [](auto...) 
{/*no-op*/} - ); - } - -// ---------------------------------------------------------------------------------------- - - template - void disable_bias( - T& obj - ) - { - switch_(bools(is_detected{}), - [&](true_t, auto _) { _(obj).disable_bias(); }, - [](auto...) { /*no-op*/ } - ); - } - -// ---------------------------------------------------------------------------------------- - - template - void call_clean_method_if_exists(T& obj) - /*! - ensures - - calls obj.clean() if obj has a .clean() method. - !*/ - { - switch_(bools(is_detected{}), - [&](true_t, auto _) { _(obj).clean(); }, - [](auto...) { /*no-op*/ } - ); - } - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - class repeat_input_layer - { - /*! - None of the declarations in this object are really used. The only reason it - exists is to allow the repeat object to use a special input layer in its - internal networks which will cause add_tag_layer objects that happen to be - right at the input to not create copies of their input tensors. So - introducing the repeat_input_layer object allows us to optimize the - implementation of add_tag_layer for a special case that arises when it's - used in the context of the repeat layer. 
- !*/ - public: - typedef int input_type; - - template - void to_tensor ( - forward_iterator , - forward_iterator , - resizable_tensor& - ) const - { - } - - friend void serialize(const repeat_input_layer&, std::ostream&){} - friend void deserialize(repeat_input_layer&, std::istream&){} - friend std::ostream& operator<<(std::ostream& out, const repeat_input_layer&) { return out; } - }; - - inline std::string tensor_to_str ( - const tensor& t, - int& min_length - ) - { - if (t.size() == 0) - return ""; - - std::ostringstream sout; - sout << "output size=(num:"<< t.num_samples() << ", "; - sout << "k:" << t.k() << ","; - while (sout.tellp() < 28) sout << " "; - sout << "nr:" << t.nr() << ","; - while (sout.tellp() < 28+8) sout << " "; - sout << "nc:" << t.nc() << ")"; - while (sout.tellp() < min_length) sout << " "; - min_length = sout.tellp(); - sout << "\t"; - return sout.str(); - } - } - -// ---------------------------------------------------------------------------------------- - - // Tell us if T is one of the special layer types (i.e. add_layer, repeat, add_tag_layer, or - // add_skip_layer). - template struct is_nonloss_layer_type : std::false_type {}; - // Tell us if T is an instance of add_loss_layer. 
- template struct is_loss_layer_type : std::false_type {}; - // Tell us if T is an instance of add_layer - template struct is_add_layer : std::false_type {}; - - namespace impl - { - template - auto tuple_subset( - const Tuple& item, - std::index_sequence - ) - { - return std::make_tuple(std::get(item)...); - } - - template - auto basic_tuple_tail( - const std::tuple& item - ) - { - return tuple_subset(item, pop_front_t>{}); - } - - template - auto tuple_flatten(const T& t) - { - return std::make_tuple(t); - } - - template - auto tuple_flatten( - const std::tuple& item, - std::index_sequence - ) - { - return std::tuple_cat(tuple_flatten(std::get(item))...); - } - - template - auto tuple_flatten(const std::tuple& item) - { - return tuple_flatten(item, std::index_sequence_for{}); - } - - template - struct tuple_head_helper - { - typedef T type; - static const type& get(const T& item) - { - return item; - } - }; - - template - struct tuple_head_helper> - { - typedef typename tuple_head_helper::type type; - static const type& get(const std::tuple& item) - { - return tuple_head_helper::get(std::get<0>(item)); - } - }; - - template struct alwaysbool { typedef bool type; }; - // one more structure for VS 2015 UP3 support workaround - template struct alwaysbool2 { typedef bool type; }; - - resizable_tensor& rt(); - - // The significance of a layer's backward method requiring forward's outputs is - // that such as layer can't have an in-place layer stacked on top of it because - // in-place layers overwrite the output of the layer they sit on top of. 
- template - constexpr auto backward_requires_forward_output( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool::type - { - return true; - } - - template - constexpr auto backward_requires_forward_output( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool::type - { - return false; - } - - template - constexpr auto backward_requires_forward_output( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool::type - { - return true; - } - - template - constexpr auto backward_requires_forward_output( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool::type - { - return false; - } - - template - constexpr auto has_inplace_backward( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool2::type - { - return false; - } - - template - constexpr auto has_inplace_backward( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool2::type - { - return false; - } - - template - constexpr auto has_inplace_backward( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool2::type - { - return true; - } - - template - constexpr auto has_inplace_backward( - layer_type& layer, - SUBNET& sub - ) -> typename alwaysbool2::type - { - return true; - } - - template - constexpr auto is_inplace_layer( - layer_type& layer, - const SUBNET& sub - ) -> typename alwaysbool2::type - { - return false; - } - - template - constexpr auto is_inplace_layer( - layer_type& layer, - const SUBNET& sub - ) -> typename alwaysbool::type - { - return true; - } - - template - auto call_layer_backward( - layer_type& layer, - const tensor& computed_output, - const tensor& gradient_input, - SUBNET& sub, - tensor& params_grad - ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad)) - { - layer.backward(computed_output,gradient_input,sub,params_grad); - } - - template - auto call_layer_backward( - layer_type& layer, - const tensor& , - const tensor& gradient_input, - SUBNET& sub, - tensor& params_grad - ) -> 
decltype(layer.backward(gradient_input,sub,params_grad)) - { - layer.backward(gradient_input,sub,params_grad); - } - - template - auto call_layer_backward( - layer_type& layer, - const tensor& computed_output, - const tensor& gradient_input, - SUBNET& sub, - tensor& params_grad - ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad)) - { - layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad); - } - - template - auto call_layer_backward( - layer_type& layer, - const tensor& , - const tensor& gradient_input, - SUBNET& sub, - tensor& params_grad - ) -> decltype(layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad)) - { - layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad); - } - - - template - auto call_layer_forward( - layer_type& layer, - const SUBNET& sub, - tensor& /*data_output*/ - ) -> decltype(layer.forward(sub,rt())) - { - // This overload of call_layer_forward() is here because this template - // naturally gets instantiated but only on code paths that never get executed. - // So rather than writing a bunch of hard to read template magic around call - // sites we just have this overload that doesn't do anything (and an assert to - // make sure that's the case). 
- DLIB_CASSERT(false, "This should never happen"); - } - - template - auto call_layer_forward( - layer_type& layer, - const SUBNET& sub, - resizable_tensor& data_output - ) -> decltype(layer.forward(sub,data_output)) - { - layer.forward(sub,data_output); - } - - template - auto call_layer_forward( - layer_type& layer, - const SUBNET& sub, - tensor& data_output - ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) - { - layer.forward_inplace(sub.get_output(),data_output); - } - - template - auto call_layer_forward( - layer_type& layer, - const SUBNET& sub, - resizable_tensor& data_output - ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) - { - if (!have_same_dimensions(data_output, sub.get_output())) - data_output.copy_size(sub.get_output()); - layer.forward_inplace(sub.get_output(),static_cast(data_output)); - } - - - } // end namespace impl - - template - auto tuple_head ( - const std::tuple& item - ) - { - return impl::tuple_head_helper>::get(item); - } - - template - auto tuple_tail( - const std::tuple& item - ) - { - return impl::basic_tuple_tail(impl::tuple_flatten(item)); - } - - inline std::tuple<> tuple_tail( - const std::tuple<>& item - ) - { - return item; - } -// ---------------------------------------------------------------------------------------- - - template - class sstack - { - public: - typedef T value_type; - - sstack() = delete; - - sstack ( - T* data_, - size_t s - ) : data(data_), mysize(s) {} - - const T& top() const - { - DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); - return *data; - } - T& top() - { - DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); - return *data; - } - - size_t size() const { return mysize; } - - sstack pop(size_t num=1) - { - DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it."); - return sstack(data+num, mysize-num); - } - - private: - - T* data; - size_t mysize; - }; - - template - sstack 
make_sstack(std::vector& item) - { - return sstack(item.data(), item.size()); - } - -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- - - namespace dimpl - { - template - class subnet_wrapper - { - /*! - WHAT THIS OBJECT REPRESENTS - This is a tool that makes an add_layer or add_loss_layer object - expose only the part of its interface defined by the SUBNET - type in layers_abstract.h. This way, when we pass subnetwork - objects to the layer callbacks those callbacks won't be able to - interact with the subnetworks in a way other than specified - by the SUBNET interface spec. - - We also allow the top layer of a subnet_wrapper stack to call the - private_get_output() and private_get_gradient_input() functions. This - way, layers that have had their output/gradient overwritten by in-place - layers can only be accessed from the in-place layers that sit directly - on top of them since those in-place layers are the only layers that - know how to interact with them properly. - !*/ - - public: - subnet_wrapper(const subnet_wrapper&) = delete; - subnet_wrapper& operator=(const subnet_wrapper&) = delete; - - subnet_wrapper(T& l_, unsigned int sef) : l(l_),_sample_expansion_factor(sef) {} - // Not much here because in this case T is one of the input layer types - // that doesn't have anything in it. 
- typedef T layer_details_type; - typedef T input_layer_type; - const layer_details_type& layer_details() const { return l; } - const input_layer_type& input_layer() const { return l; } - input_layer_type& input_layer() { return l; } - unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } - private: - T& l; - unsigned int _sample_expansion_factor; - }; - - template - class subnet_wrapper::value>::type> - { - - public: - subnet_wrapper(const subnet_wrapper&) = delete; - subnet_wrapper& operator=(const subnet_wrapper&) = delete; - - typedef T wrapped_type; - const static size_t num_computational_layers = T::num_computational_layers; - const static size_t num_layers = T::num_layers; - typedef typename T::layer_details_type layer_details_type; - typedef typename T::input_layer_type input_layer_type; - - subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} - - const tensor& get_output() const { return l.private_get_output(); } - tensor& get_gradient_input() { return l.private_get_gradient_input(); } - - const layer_details_type& layer_details() const { return l.layer_details(); } - - const subnet_wrapper& subnet() const { return subnetwork; } - subnet_wrapper& subnet() { return subnetwork; } - unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } - - const input_layer_type& input_layer() const { return l.input_layer(); } - input_layer_type& input_layer() { return l.input_layer(); } - - private: - T& l; - subnet_wrapper subnetwork; - }; - - template - class subnet_wrapper::value>::type> - { - - public: - subnet_wrapper(const subnet_wrapper&) = delete; - subnet_wrapper& operator=(const subnet_wrapper&) = delete; - - typedef T wrapped_type; - const static size_t num_computational_layers = T::num_computational_layers; - const static size_t num_layers = T::num_layers; - typedef typename T::layer_details_type layer_details_type; - typedef typename T::input_layer_type 
input_layer_type; - - subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} - - const tensor& get_output() const { return l.get_output(); } - tensor& get_gradient_input() { return l.get_gradient_input(); } - - const layer_details_type& layer_details() const { return l.layer_details(); } - - const subnet_wrapper& subnet() const { return subnetwork; } - subnet_wrapper& subnet() { return subnetwork; } - unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } - - const input_layer_type& input_layer() const { return l.input_layer(); } - input_layer_type& input_layer() { return l.input_layer(); } - - private: - T& l; - subnet_wrapper subnetwork; - }; - } - -// ---------------------------------------------------------------------------------------- - - enum class zero_gradients : uint8_t - { - no = 0, - yes = 1 - }; - -// ---------------------------------------------------------------------------------------- - - template - class add_layer; - - template - void serialize(const add_layer& item, std::ostream& out); - template - void deserialize(add_layer& item, std::istream& in); - - template - struct is_nonloss_layer_type> : std::true_type {}; - - template - class add_layer::value>::type> - { - public: - typedef LAYER_DETAILS layer_details_type; - typedef SUBNET subnet_type; - typedef typename subnet_type::input_layer_type input_layer_type; - typedef typename subnet_type::input_type input_type; - const static size_t num_layers = subnet_type::num_layers + 1; - const static size_t num_computational_layers = subnet_type::num_computational_layers + 1; - - add_layer( - ): - subnetwork(new subnet_type()), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false) - { - if (this_layer_operates_inplace()) - subnetwork->disable_output_and_gradient_getters(); - } - - add_layer(const add_layer& item) - { - details = item.details; - subnetwork.reset(new 
subnet_type(*item.subnetwork)); - this_layer_setup_called = item.this_layer_setup_called; - gradient_input_is_stale = item.gradient_input_is_stale; - get_output_and_gradient_input_disabled = item.get_output_and_gradient_input_disabled; - x_grad = item.x_grad; - cached_output = item.cached_output; - params_grad = item.params_grad; - temp_tensor = item.temp_tensor; - } - add_layer& operator=(const add_layer& item) { add_layer(item).swap(*this); return *this;} - add_layer(add_layer&& item) : add_layer() { swap(item); } - add_layer& operator=(add_layer&& item) { swap(item); return *this; } - - template - friend class add_layer; - template - friend class dimpl::subnet_wrapper; - template - friend class add_tag_layer; - template class T, typename U> - friend class add_skip_layer; - template class L, typename S> - friend class repeat; - - // Allow copying networks from one to another as long as their corresponding - // layers can be constructed from each other. - template - add_layer( - const add_layer& item - ) : - details(item.layer_details()), - subnetwork(new subnet_type(item.subnet())), - this_layer_setup_called(item.this_layer_setup_called), - gradient_input_is_stale(item.gradient_input_is_stale), - get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled), - x_grad(item.x_grad), - cached_output(item.cached_output) - { - if (this_layer_operates_inplace()) - subnetwork->disable_output_and_gradient_getters(); - } - - template - add_layer( - const LAYER_DETAILS& layer_det, - T&& ...args - ) : - details(layer_det), - subnetwork(new subnet_type(std::forward(args)...)), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false) - { - if (this_layer_operates_inplace()) - subnetwork->disable_output_and_gradient_getters(); - } - - template - struct disable_forwarding_constr - { - const static bool value = std::is_constructible::value; - }; - template - struct disable_forwarding_constr,U...> - { 
- const static bool value = disable_forwarding_constr::type...>::value; - }; - template - struct disable_forwarding_constr,U...> - { - const static bool value = disable_forwarding_constr::type>::value; - }; - template - struct disable_forwarding_constr,U...> - { - const static bool value = true; - }; - template - struct disable_forwarding_constr> - { - const static bool value = true; - }; - - template < - typename ...T, - typename = typename std::enable_if::type...>::value>::type - > - add_layer( - T&& ...args - ) : - subnetwork(new subnet_type(std::forward(args)...)), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false) - { - if (this_layer_operates_inplace()) - subnetwork->disable_output_and_gradient_getters(); - } - - template - add_layer( - LAYER_DETAILS&& layer_det, - T&& ...args - ) : - details(std::move(layer_det)), - subnetwork(new subnet_type(std::forward(args)...)), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false) - { - if (this_layer_operates_inplace()) - subnetwork->disable_output_and_gradient_getters(); - } - - template - add_layer( - const std::tuple& layer_det, - T&& ...args - ) : - details(tuple_head(layer_det)), - subnetwork(new subnet_type(tuple_tail(layer_det),std::forward(args)...)), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false) - { - if (this_layer_operates_inplace()) - subnetwork->disable_output_and_gradient_getters(); - } - - template - add_layer( - std::tuple<>, - const std::tuple& layer_det, - T&& ...args - ) : add_layer(layer_det,args...) { } - - add_layer ( - std::tuple<> - ) : add_layer() {} - - template - add_layer( - std::tuple<>, - LAYER_DETAILS&& layer_det, - T&& ...args - ) : add_layer(layer_det, args...) 
{ } - - template - void to_tensor ( - forward_iterator ibegin, - forward_iterator iend, - resizable_tensor& data - ) const - { - subnetwork->to_tensor(ibegin,iend,data); - } - - template - const tensor& operator() ( - forward_iterator ibegin, - forward_iterator iend - ) - { - to_tensor(ibegin,iend,temp_tensor); - return forward(temp_tensor); - } - - - const tensor& operator() (const input_type& x) - { - return (*this)(&x, &x+1); - } - - const tensor& forward(const tensor& x) - { - subnetwork->forward(x); - const dimpl::subnet_wrapper wsub(*subnetwork); - if (!this_layer_setup_called) - { - details.setup(wsub); - this_layer_setup_called = true; - } - if (this_layer_operates_inplace()) - impl::call_layer_forward(details, wsub, private_get_output()); - else - impl::call_layer_forward(details, wsub, cached_output); - - gradient_input_is_stale = true; - return private_get_output(); - } - - private: - tensor& private_get_output() const - { - if (const_cast(*this).this_layer_operates_inplace()) - return subnetwork->private_get_output(); - else - return const_cast(cached_output); - } - tensor& private_get_gradient_input() - { - if (this_layer_operates_inplace()) - { - return subnetwork->private_get_gradient_input(); - } - else - { - if (gradient_input_is_stale) - { - gradient_input_is_stale = false; - x_grad.copy_size(private_get_output()); - x_grad = 0; - } - return x_grad; - } - } - void disable_output_and_gradient_getters ( - ) { get_output_and_gradient_input_disabled = true; } - public: - const tensor& get_output() const - { - if (get_output_and_gradient_input_disabled) - throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); - return private_get_output(); - } - tensor& get_gradient_input() - { - if (get_output_and_gradient_input_disabled) - throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); - return 
private_get_gradient_input(); - } - - const tensor& get_final_data_gradient( - ) const { return subnetwork->get_final_data_gradient(); } - - void back_propagate_error( - const tensor& x, - zero_gradients zero_grads = zero_gradients::yes - ) - { - back_propagate_error(x, private_get_gradient_input(), zero_grads); - } - void back_propagate_error( - const tensor& x, - const tensor& gradient_input, - zero_gradients zero_grads = zero_gradients::yes - ) - { - dimpl::subnet_wrapper wsub(*subnetwork); - params_grad.copy_size(details.get_layer_params()); - impl::call_layer_backward(details, private_get_output(), - gradient_input, wsub, static_cast(params_grad)); - - subnetwork->back_propagate_error(x, zero_grads); - - // zero out get_gradient_input() - gradient_input_is_stale = zero_grads == zero_gradients::yes; - } - - template - void update_parameters(sstack solvers, double learning_rate) - { - DLIB_CASSERT(solvers.size()>=num_computational_layers); - // Don't try to adjust the parameters if this layer doesn't have any or the - // learning rate is disabled for this layer. 
- if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) - { - const tensor& step = solvers.top()(learning_rate, details, static_cast(params_grad)); - tt::add(details.get_layer_params(), details.get_layer_params(), step); - } - subnetwork->update_parameters(solvers.pop(), learning_rate); - } - - template - void update_parameters(std::vector& solvers, double learning_rate) - { - update_parameters(make_sstack(solvers), learning_rate); - } - - const tensor& get_parameter_gradient( - ) const { return params_grad; } - - tensor& get_parameter_gradient ( - ) { return params_grad; } - - const subnet_type& subnet() const { return *subnetwork; } - subnet_type& subnet() { return *subnetwork; } - - const input_layer_type& input_layer() const { return subnet().input_layer(); } - input_layer_type& input_layer() { return subnet().input_layer(); } - - const layer_details_type& layer_details() const { return details; } - layer_details_type& layer_details() { return details; } - - unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } - - void set_gradient_inputs_to_zero() - { - gradient_input_is_stale = true; - subnetwork->set_gradient_inputs_to_zero(); - } - - void clean() - { - x_grad.clear(); - cached_output.clear(); - params_grad.clear(); - temp_tensor.clear(); - gradient_input_is_stale = true; - subnetwork->clean(); - call_clean_method_if_exists(details); - } - - friend void serialize(const add_layer& item, std::ostream& out) - { - int version = 2; - serialize(version, out); - serialize(*item.subnetwork, out); - serialize(item.details, out); - serialize(item.this_layer_setup_called, out); - serialize(item.gradient_input_is_stale, out); - serialize(item.get_output_and_gradient_input_disabled, out); - serialize(item.x_grad, out); - serialize(item.cached_output, out); - serialize(item.params_grad, out); - } - - friend void deserialize(add_layer& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if 
(!(1 <= version && version <= 2)) - throw serialization_error("Unexpected version found while deserializing dlib::add_layer."); - deserialize(*item.subnetwork, in); - deserialize(item.details, in); - deserialize(item.this_layer_setup_called, in); - deserialize(item.gradient_input_is_stale, in); - deserialize(item.get_output_and_gradient_input_disabled, in); - deserialize(item.x_grad, in); - deserialize(item.cached_output, in); - if (version == 2) - deserialize(item.params_grad, in); - } - - friend std::ostream& operator<< (std::ostream& out, const add_layer& item) - { - int min_length = 0; - item.print(out, 0, min_length); - return out; - } - - void print (std::ostream& out, unsigned long idx, int& min_length) const - { - out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; - subnet().print(out, idx+1, min_length); - } - - private: - - bool this_layer_operates_inplace( - ) - { - // This layer can run in-place if it's an in-place capable layer and also if - // the layer it's on top of doesn't need its own output tensor (since in-place - // layers overwrite that tensor) - return impl::is_inplace_layer(details, *subnetwork) && !subnetwork->this_layer_requires_forward_output(); - } - bool this_layer_requires_forward_output( - ) - { - return impl::backward_requires_forward_output(details, *subnetwork); - } - - void swap(add_layer& item) - { - std::swap(subnetwork,item.subnetwork); - std::swap(details, item.details); - std::swap(this_layer_setup_called, item.this_layer_setup_called); - std::swap(gradient_input_is_stale, item.gradient_input_is_stale); - std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); - std::swap(x_grad, item.x_grad); - std::swap(cached_output, item.cached_output); - std::swap(params_grad, item.params_grad); - } - - - LAYER_DETAILS details; - std::unique_ptr subnetwork; - bool this_layer_setup_called; - bool gradient_input_is_stale; - bool 
get_output_and_gradient_input_disabled; - // Note that if this_layer_operates_inplace()==true then x_grad and cached_output - // are not used at all. Instead, this layer uses these variables from the lower - // layer. - resizable_tensor x_grad; - resizable_tensor cached_output; - - resizable_tensor params_grad; - - // temp_tensor doesn't logically contribute to the state of this object. - // It is here only to prevent it from being reallocated over and over. - resizable_tensor temp_tensor; - - }; - - template - struct is_add_layer> : std::true_type {}; - template - struct is_add_layer> : std::true_type {}; - template - struct is_add_layer&> : std::true_type {}; - template - struct is_add_layer&> : std::true_type {}; - -// ---------------------------------------------------------------------------------------- - -// This version of add_layer handles the special case where the subnetwork being given is -// just an input layer object. - template - class add_layer - { - public: - typedef LAYER_DETAILS layer_details_type; - typedef INPUT_LAYER subnet_type; - typedef INPUT_LAYER input_layer_type; - typedef typename INPUT_LAYER::input_type input_type; - const static size_t num_layers = 2; - const static size_t num_computational_layers = 1; - - add_layer( - ): - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false), - _sample_expansion_factor(0) - {} - - add_layer(const add_layer&) = default; - add_layer(add_layer&& item) : add_layer() { swap(item); } - add_layer& operator=(const add_layer&) = default; - add_layer& operator=(add_layer&& item) { swap(item); return *this; } - - template - friend class add_layer; - template - friend class dimpl::subnet_wrapper; - template - friend class add_tag_layer; - template class T, typename U> - friend class add_skip_layer; - template class L, typename S> - friend class repeat; - - // Allow copying networks from one to another as long as their corresponding - // layers can be 
constructed from each other. - template - add_layer( - const add_layer& item - ): - input_layer_(item.subnet()), - details(item.layer_details()), - this_layer_setup_called(item.this_layer_setup_called), - gradient_input_is_stale(item.gradient_input_is_stale), - get_output_and_gradient_input_disabled(false), - _sample_expansion_factor(item._sample_expansion_factor), - x_grad(item.x_grad), - cached_output(item.cached_output), - grad_final(item.grad_final) - { - } - - add_layer( - const LAYER_DETAILS& layer_det - ) : - details(layer_det), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false), - _sample_expansion_factor(0) - {} - - add_layer( - const INPUT_LAYER& il - ) : - input_layer_(il), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false), - _sample_expansion_factor(0) - {} - - add_layer( - LAYER_DETAILS&& layer_det - ) : - details(std::move(layer_det)), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false), - _sample_expansion_factor(0) - {} - - add_layer( - LAYER_DETAILS layer_det, - INPUT_LAYER il - ) : - details(std::move(layer_det)), - input_layer_(std::move(il)), - this_layer_setup_called(false), - gradient_input_is_stale(true), - get_output_and_gradient_input_disabled(false), - _sample_expansion_factor(0) - {} - - add_layer( - std::tuple<>, - const LAYER_DETAILS& layer_det - ) : add_layer(layer_det) {} - - add_layer( - std::tuple<>, - LAYER_DETAILS&& layer_det - ) : add_layer(layer_det) {} - - add_layer( - std::tuple<>, - LAYER_DETAILS layer_det, - INPUT_LAYER il - ) : add_layer(layer_det,il) {} - - add_layer( - const std::tuple& layer_det - ) : add_layer(tuple_head(layer_det)) {} - - add_layer( - const std::tuple& layer_det, - INPUT_LAYER il - ) : add_layer(tuple_head(layer_det),il) {} - - template - void to_tensor ( - forward_iterator ibegin, - forward_iterator iend, - 
resizable_tensor& data - ) const - { - input_layer_.to_tensor(ibegin, iend, data); - // make sure the input layer's to_tensor() function is implemented properly. - DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), - "The input layer can't produce fewer output tensors than there are inputs."); - DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, - "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); - - _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); - data.async_copy_to_device(); - } - - - template - const tensor& operator() ( - forward_iterator ibegin, - forward_iterator iend - ) - { - to_tensor(ibegin,iend,temp_tensor); - return forward(temp_tensor); - } - - - const tensor& operator() (const input_type& x) - { - return (*this)(&x, &x+1); - } - - const tensor& forward (const tensor& x) - { - DLIB_CASSERT(sample_expansion_factor() != 0, "You must call to_tensor() before this function can be used."); - DLIB_CASSERT(x.num_samples()%sample_expansion_factor() == 0); - subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); - if (!this_layer_setup_called) - { - details.setup(wsub); - this_layer_setup_called = true; - } - impl::call_layer_forward(details, wsub, cached_output); - gradient_input_is_stale = true; - return private_get_output(); - } - - private: - tensor& private_get_output() const { return const_cast(cached_output); } - tensor& private_get_gradient_input() - { - if (gradient_input_is_stale) - { - gradient_input_is_stale = false; - x_grad.copy_size(private_get_output()); - x_grad = 0; - } - return x_grad; - } - void disable_output_and_gradient_getters ( - ) { get_output_and_gradient_input_disabled = true; } - public: - const tensor& get_output() const - { - if (get_output_and_gradient_input_disabled) - throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); - return 
private_get_output(); - } - tensor& get_gradient_input() - { - if (get_output_and_gradient_input_disabled) - throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); - return private_get_gradient_input(); - } - - const tensor& get_final_data_gradient( - ) const { return grad_final; } - - void back_propagate_error( - const tensor& x, - zero_gradients zero_grads = zero_gradients::yes - ) - { - back_propagate_error(x, private_get_gradient_input(), zero_grads); - } - void back_propagate_error( - const tensor& x, - const tensor& gradient_input, - zero_gradients zero_grads = zero_gradients::yes - ) - { - // make sure grad_final is initialized to 0 - if (!have_same_dimensions(x, grad_final)) - grad_final.copy_size(x); - grad_final = 0; - - subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); - params_grad.copy_size(details.get_layer_params()); - impl::call_layer_backward(details, private_get_output(), - gradient_input, wsub, static_cast(params_grad)); - - // zero out get_gradient_input() - gradient_input_is_stale = zero_grads == zero_gradients::yes; - } - - template - void update_parameters(sstack solvers, double learning_rate) - { - DLIB_CASSERT(solvers.size()>=num_computational_layers); - // Don't try to adjust the parameters if this layer doesn't have any or the - // learning rate is disabled for this layer. 
- if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) - { - const tensor& step = solvers.top()(learning_rate, details, static_cast(params_grad)); - tt::add(details.get_layer_params(), details.get_layer_params(), step); - } - } - - template - void update_parameters(std::vector& solvers, double learning_rate) - { - update_parameters(make_sstack(solvers), learning_rate); - } - - const tensor& get_parameter_gradient( - ) const { return params_grad; } - - tensor& get_parameter_gradient ( - ) { return params_grad; } - - const subnet_type& subnet() const { return input_layer_; } - subnet_type& subnet() { return input_layer_; } - - const subnet_type& input_layer() const { return input_layer_; } - subnet_type& input_layer() { return input_layer_; } - - const layer_details_type& layer_details() const { return details; } - layer_details_type& layer_details() { return details; } - - unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } - - void set_gradient_inputs_to_zero() - { - gradient_input_is_stale = true; - } - - void clean() - { - x_grad.clear(); - grad_final.clear(); - cached_output.clear(); - params_grad.clear(); - temp_tensor.clear(); - gradient_input_is_stale = true; - call_clean_method_if_exists(details); - } - - friend void serialize(const add_layer& item, std::ostream& out) - { - int version = 3; - serialize(version, out); - serialize(item.input_layer_, out); - serialize(item.details, out); - serialize(item.this_layer_setup_called, out); - serialize(item.gradient_input_is_stale, out); - serialize(item.get_output_and_gradient_input_disabled, out); - serialize(item.x_grad, out); - serialize(item.cached_output, out); - serialize(item.grad_final, out); - serialize(item._sample_expansion_factor, out); - } - - friend void deserialize(add_layer& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (!(2 <= version && version <= 3)) - throw serialization_error("Unexpected version found while 
deserializing dlib::add_layer."); - deserialize(item.input_layer_, in); - deserialize(item.details, in); - deserialize(item.this_layer_setup_called, in); - deserialize(item.gradient_input_is_stale, in); - deserialize(item.get_output_and_gradient_input_disabled, in); - deserialize(item.x_grad, in); - deserialize(item.cached_output, in); - deserialize(item.grad_final, in); - if (version >= 3) - deserialize(item._sample_expansion_factor, in); - else - item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. - } - - friend std::ostream& operator<< (std::ostream& out, const add_layer& item) - { - int min_length = 0; - item.print(out, 0, min_length); - return out; - } - - void print (std::ostream& out, unsigned long idx, int& min_length) const - { - out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; - - // Don't print the repeat_input_layer since it doesn't exist from the user's - // point of view. It's just an artifact of how repeat<> works. 
- if (!std::is_same::value) - out << "layer<" << idx+1 << ">\t" << subnet() << "\n"; - } - - private: - - bool this_layer_requires_forward_output( - ) - { - subnet_wrapper wsub(grad_final, grad_final, _sample_expansion_factor); - return impl::backward_requires_forward_output(details, wsub); - } - - class subnet_wrapper - { - public: - subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_, unsigned int sef) : - x(x_), grad_final(grad_final_), _sample_expansion_factor(sef) {} - - subnet_wrapper(const subnet_wrapper&) = delete; - subnet_wrapper& operator=(const subnet_wrapper&) = delete; - - unsigned int sample_expansion_factor() const { return _sample_expansion_factor;} - const tensor& get_output() const { return x; } - tensor& get_gradient_input() - { - if (!have_same_dimensions(x, grad_final)) - { - grad_final.copy_size(x); - grad_final = 0; - } - return grad_final; - } - - private: - const tensor& x; - resizable_tensor& grad_final; - unsigned int _sample_expansion_factor; - }; - - void swap(add_layer& item) - { - std::swap(input_layer_, item.input_layer_); - std::swap(details, item.details); - std::swap(this_layer_setup_called, item.this_layer_setup_called); - std::swap(gradient_input_is_stale, item.gradient_input_is_stale); - std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); - std::swap(x_grad, item.x_grad); - std::swap(cached_output, item.cached_output); - std::swap(grad_final, item.grad_final); - std::swap(_sample_expansion_factor, item._sample_expansion_factor); - } - - subnet_type input_layer_; - LAYER_DETAILS details; - bool this_layer_setup_called; - bool gradient_input_is_stale; - bool get_output_and_gradient_input_disabled; - mutable unsigned int _sample_expansion_factor; - resizable_tensor x_grad; - resizable_tensor cached_output; - resizable_tensor grad_final; - - // The following 2 objects don't logically contribute to the state of this class. 
- // They are only here to prevent them from being reallocated over and over in - // member functions. - resizable_tensor params_grad; - resizable_tensor temp_tensor; - }; - -// ---------------------------------------------------------------------------------------- - - template - class add_tag_layer; - - template class tag> - struct tag_id - { - const static unsigned long id = tag::id; - }; - - template - class add_tag_layer::value>::type> - { - public: - typedef SUBNET subnet_type; - typedef typename subnet_type::input_type input_type; - typedef typename subnet_type::input_layer_type input_layer_type; - typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. - const static size_t num_layers = subnet_type::num_layers + 1; - const static size_t num_computational_layers = subnet_type::num_computational_layers; - const static unsigned long id = ID; - - add_tag_layer() {}; - add_tag_layer(const add_tag_layer&) = default; - add_tag_layer(add_tag_layer&&) = default; - add_tag_layer& operator=(add_tag_layer&&) = default; - add_tag_layer& operator=(const add_tag_layer&) = default; - - template - add_tag_layer( - const add_tag_layer& item - ) : subnetwork(item.subnet()) - {} - - template - add_tag_layer( - T ...args - ) : - subnetwork(std::move(args)...) 
- { - } - - template - void to_tensor ( - forward_iterator ibegin, - forward_iterator iend, - resizable_tensor& data - ) const - { - subnetwork.to_tensor(ibegin,iend,data); - } - - template - const tensor& operator() ( - forward_iterator ibegin, - forward_iterator iend - ) - { - return subnetwork(ibegin,iend); - } - - const tensor& operator() (const input_type& x) - { - return subnetwork(x); - } - - const tensor& forward(const tensor& x) - { - return subnetwork.forward(x); - } - - const tensor& get_output() const { return subnetwork.get_output(); } - - tensor& get_gradient_input() - { - return subnetwork.get_gradient_input(); - } - - const tensor& get_final_data_gradient( - ) const { return subnetwork.get_final_data_gradient(); } - - void back_propagate_error( - const tensor& x, - zero_gradients zero_grads = zero_gradients::yes - ) - { - subnetwork.back_propagate_error(x, zero_grads); - } - void back_propagate_error( - const tensor& x, - const tensor& gradient_input, - zero_gradients zero_grads = zero_gradients::yes - ) - { - subnetwork.back_propagate_error(x,gradient_input, zero_grads); - } - - template - void update_parameters(sstack solvers, double learning_rate) - { - subnetwork.update_parameters(solvers, learning_rate); - } - - template - void update_parameters(std::vector& solvers, double learning_rate) - { - update_parameters(make_sstack(solvers), learning_rate); - } - - const tensor& get_parameter_gradient( - ) const { return params_grad; } - - tensor& get_parameter_gradient ( - ) { return params_grad; } - - const subnet_type& subnet() const { return subnetwork; } - subnet_type& subnet() { return subnetwork; } - - const input_layer_type& input_layer() const { return subnet().input_layer(); } - input_layer_type& input_layer() { return subnet().input_layer(); } - - unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } - - void set_gradient_inputs_to_zero() - { - subnetwork.set_gradient_inputs_to_zero(); - } - - void 
clean() - { - subnetwork.clean(); - } - - friend void serialize(const add_tag_layer& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.subnetwork, out); - } - - friend void deserialize(add_tag_layer& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); - deserialize(item.subnetwork, in); - } - - friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) - { - int min_length = 0; - item.print(out, 0, min_length); - return out; - } - - void print (std::ostream& out, unsigned long idx, int& min_length) const - { - out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << "tag" << ID << "\n"; - subnet().print(out, idx+1, min_length); - } - - private: - - template - friend class add_layer; - template - friend class dimpl::subnet_wrapper; - template - friend class add_tag_layer; - template class T, typename U> - friend class add_skip_layer; - template class L, typename S> - friend class repeat; - - // You wouldn't put a tag on a layer if you didn't want to access its forward - // outputs. So this is always true. - bool this_layer_requires_forward_output( - ) { return true; } - - void disable_output_and_gradient_getters ( - ) - { - // This should never happen because only inplace layers call - // disable_output_and_gradient_getters(), however, putting a tag layer right - // before an inplace layer basically means you don't want the following layer - // to operate in place. So the inplace layer should turn itself into an - // out-of-place layer and not call disable_output_and_gradient_getters(). 
- DLIB_CASSERT(false,"This should never happen"); - } - - tensor& private_get_output() const - { return subnetwork.private_get_output(); } - tensor& private_get_gradient_input() - { return subnetwork.private_get_gradient_input(); } - - subnet_type subnetwork; - - // This member doesn't logically contribute to the state of the object since it is - // always empty. It's just here so we can have the get_parameter_gradient() methods - // which have to return something. So they return this empty tensor. - resizable_tensor params_grad; - }; - -// ---------------------------------------------------------------------------------------- - - template - struct decorator_repeat_group - { - decorator_repeat_group( - T&& ...args - ) : data(std::forward(args)...) {} - - std::tuple data; - }; - template - decorator_repeat_group repeat_group ( - T&& ...args - ) - { - return decorator_repeat_group(std::forward(args)...); - } - - template < - size_t num, - template class REPEATED_LAYER, - typename SUBNET - > - class repeat - { - static_assert(num > 0, "You can't have a layer repeated 0 times."); - public: - typedef SUBNET subnet_type; - typedef typename SUBNET::input_type input_type; - typedef typename subnet_type::input_layer_type input_layer_type; - typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. 
- const static size_t comp_layers_in_each_group = (REPEATED_LAYER::num_computational_layers-SUBNET::num_computational_layers); - const static size_t comp_layers_in_repeated_group = comp_layers_in_each_group*num; - const static size_t num_computational_layers = comp_layers_in_repeated_group + SUBNET::num_computational_layers; - - const static size_t layers_in_each_group = (REPEATED_LAYER::num_layers-SUBNET::num_layers); - const static size_t layers_in_repeated_group = layers_in_each_group*num; - const static size_t num_layers = subnet_type::num_layers + layers_in_repeated_group; - - - typedef REPEATED_LAYER repeated_layer_type; - - repeat( - ) : - details(num) - { - } - - size_t num_repetitions ( - ) const { return num; } - - const repeated_layer_type& get_repeated_layer ( - size_t i - ) const - { - DLIB_CASSERT(i < num_repetitions()); - return details[i]; - } - - repeated_layer_type& get_repeated_layer ( - size_t i - ) - { - DLIB_CASSERT(i < num_repetitions()); - return details[i]; - } - - repeat(const repeat&) = default; - repeat(repeat&&) = default; - repeat& operator=(repeat&&) = default; - repeat& operator=(const repeat&) = default; - - template class T, typename U> - repeat( - const repeat& item - ) : - subnetwork(item.subnetwork) - { - for (auto&& d : item.details) - details.emplace_back(d); - } - - template - repeat( - T arg1, - U ...args2 - ): - details(num, std::move(arg1)), - subnetwork(std::move(args2)...) - { - } - - template - repeat( - decorator_repeat_group&& arg1, - U ...args2 - ): - details(num, arg1.data), - subnetwork(std::move(args2)...) - { - } - - template - repeat( - std::tuple<>, - T arg1, - U ...args2 - ): - details(num, std::move(arg1)), - subnetwork(std::move(args2)...) 
- { - } - - template - void to_tensor ( - forward_iterator ibegin, - forward_iterator iend, - resizable_tensor& data - ) const - { - subnetwork.to_tensor(ibegin,iend,data); - // call to_tensor on the networks in details just to populate the - // _sample_expansion_factor values in those networks. Other than that this - // call is a noop. - for (auto& d : details) - d.to_tensor(ibegin, iend, data); - } - - template - const tensor& operator() ( - forward_iterator ibegin, - forward_iterator iend - ) - { - to_tensor(ibegin,iend,temp_tensor); - return forward(temp_tensor); - } - - const tensor& operator() (const input_type& x) - { - return (*this)(&x, &x+1); - } - - const tensor& forward(const tensor& x) - { - subnetwork.forward(x); - details[details.size()-1].forward(subnetwork.get_output()); - for (long i = details.size()-2; i >= 0; --i) - details[i].forward(details[i+1].get_output()); - return private_get_output(); - } - - private: - tensor& private_get_output() const - { - return details[0].private_get_output(); - } - tensor& private_get_gradient_input() - { - return details[0].private_get_gradient_input(); - } - public: - const tensor& get_output() const - { - return details[0].get_output(); - } - tensor& get_gradient_input() - { - return details[0].get_gradient_input(); - } - - const tensor& get_final_data_gradient( - ) const { return subnetwork.get_final_data_gradient(); } - - const tensor& get_parameter_gradient( - ) const { return details[0].get_parameter_gradient(); } - - tensor& get_parameter_gradient ( - ) { return details[0].get_parameter_gradient(); } - - void back_propagate_error( - const tensor& x, - zero_gradients zero_grads = zero_gradients::yes - ) - { - back_propagate_error(x, private_get_gradient_input(), zero_grads); - } - void back_propagate_error( - const tensor& x, - const tensor& gradient_input, - zero_gradients zero_grads = zero_gradients::yes - ) - { - if (details.size() > 1) - { - details[0].back_propagate_error(details[1].get_output(), 
gradient_input, zero_grads); - for (size_t i = 1; i < details.size(); ++i) - { - if (i+1 < details.size()) - details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient(), zero_grads); - else - details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient(), zero_grads); - } - } - else - { - details[0].back_propagate_error(subnetwork.get_output(), gradient_input, zero_grads); - } - subnetwork.back_propagate_error(x, details.back().get_final_data_gradient(), zero_grads); - } - - template - void update_parameters(sstack solvers, double learning_rate) - { - for (size_t i = 0; i < details.size(); ++i) - details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),learning_rate); - subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),learning_rate); - } - - template - void update_parameters(std::vector& solvers, double learning_rate) - { - update_parameters(make_sstack(solvers), learning_rate); - } - - const subnet_type& subnet() const { return subnetwork; } - subnet_type& subnet() { return subnetwork; } - - const input_layer_type& input_layer() const { return subnet().input_layer(); } - input_layer_type& input_layer() { return subnet().input_layer(); } - - unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } - - void set_gradient_inputs_to_zero() - { - subnetwork.set_gradient_inputs_to_zero(); - } - - void clean() - { - temp_tensor.clear(); - subnetwork.clean(); - for (auto&& d : details) - d.clean(); - } - - friend void serialize(const repeat& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.details, out); - serialize(item.subnetwork, out); - } - - friend void deserialize(repeat& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::repeat."); - 
deserialize(item.details, in); - deserialize(item.subnetwork, in); - } - - friend std::ostream& operator<< (std::ostream& out, const repeat& item) - { - int min_length = 0; - item.print(out, 0, min_length); - return out; - } - - void print (std::ostream& out, unsigned long idx, int& min_length) const - { - for (size_t i = 0; i < num_repetitions(); ++i) - { - get_repeated_layer(i).print(out, idx, min_length); - idx += layers_in_each_group; - } - subnet().print(out, idx, min_length); - } - private: - - - template - friend class add_layer; - template - friend class dimpl::subnet_wrapper; - template - friend class add_tag_layer; - template class T, typename U> - friend class add_skip_layer; - template class L, typename S> - friend class repeat; - - bool this_layer_requires_forward_output( - ) - { - return details[0].this_layer_requires_forward_output(); - } - - void disable_output_and_gradient_getters ( - ) - { - details[0].disable_output_and_gradient_getters(); - } - - - std::vector details; - subnet_type subnetwork; - - // temp_tensor doesn't logically contribute to the state of this class. - // It is here only to void needing to reallocate it over and over. - resizable_tensor temp_tensor; - }; - - template < - size_t num, - template class REPEATED_LAYER, - typename SUBNET - > - struct is_nonloss_layer_type> : std::true_type {}; - -// ---------------------------------------------------------------------------------------- - -// This version of add_tag_layer handles the special case where the subnetwork being given -// is just an input layer object. - template - class add_tag_layer - { - public: - typedef INPUT_LAYER subnet_type; - typedef typename subnet_type::input_type input_type; - typedef INPUT_LAYER input_layer_type; - typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. 
- const static size_t num_computational_layers = 0; - const static size_t num_layers = 2; - const static unsigned long id = ID; - - add_tag_layer():cached_output_ptr(nullptr),gradient_input_is_stale(true),_sample_expansion_factor(0) {} - - add_tag_layer(const add_tag_layer&) = default; - add_tag_layer& operator=(const add_tag_layer&) = default; - add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); } - add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; } - - template - add_tag_layer( - const add_tag_layer& item - ) : input_layer_(item.subnet()), - cached_output(item.cached_output), - cached_output_ptr(nullptr), - grad_final(item.grad_final), - gradient_input_is_stale(item.gradient_input_is_stale), - _sample_expansion_factor(0) - {} - - template - add_tag_layer( - T ...args - ) : - input_layer_(std::move(args)...), - cached_output_ptr(nullptr), - gradient_input_is_stale(true), - _sample_expansion_factor(0) - { - } - - add_tag_layer ( - std::tuple<> - ) : - cached_output_ptr(nullptr), - gradient_input_is_stale(true), - _sample_expansion_factor(0) - {} - - template - void to_tensor ( - forward_iterator ibegin, - forward_iterator iend, - resizable_tensor& data - ) const - { - input_layer_.to_tensor(ibegin,iend,data); - - // make sure the input layer's to_tensor() function is implemented properly. 
- DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), - "The input layer can't produce fewer output tensors than there are inputs."); - DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, - "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); - - _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); - data.async_copy_to_device(); - } - - unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } - - template - const tensor& operator() ( - forward_iterator ibegin, - forward_iterator iend - ) - { - input_layer_.to_tensor(ibegin,iend,cached_output); - cached_output_ptr = nullptr; - return get_output(); - } - - const tensor& operator() (const input_type& x) - { - return (*this)(&x, &x+1); - } - - const tensor& forward(const tensor& x) - { - // If this tag is the first layer in one of the sub networks inside a repeat - // layer then we don't want it to be creating copies of x. This is because, we - // can just hold a pointer to x since the way repeat is constructed guarantees - // that x will have a lifetime larger than this pointer. 
- if (is_same_type::value) - cached_output_ptr = const_cast(&x); - else - cached_output = x; - gradient_input_is_stale = true; - return get_output(); - } - - const tensor& get_output() const - { - if (cached_output_ptr) - return *cached_output_ptr; - else - return cached_output; - } - - const tensor& get_final_data_gradient( - ) const { return grad_final; } - - tensor& get_gradient_input() - { - if (!have_same_dimensions(get_output(), grad_final) || - gradient_input_is_stale) - { - grad_final.copy_size(get_output()); - grad_final = 0; - gradient_input_is_stale = false; - } - return grad_final; - } - - - void back_propagate_error( - const tensor& /*x*/, - zero_gradients /*zero_grads*/ = zero_gradients::yes - ) - { - // nothing to do - } - void back_propagate_error( - const tensor& /*x*/, - const tensor& /*gradient_input*/, - zero_gradients /*zero_grads*/ = zero_gradients::yes - ) - { - // nothing to do - } - - template - void update_parameters(sstack /*solvers*/, double /*learning_rate*/) - { - // nothing to do - } - - template - void update_parameters(std::vector& solvers, double learning_rate) - { - update_parameters(make_sstack(solvers), learning_rate); - } - - const subnet_type& subnet() const { return input_layer_; } - subnet_type& subnet() { return input_layer_; } - - const input_layer_type& input_layer() const { return input_layer_; } - input_layer_type& input_layer() { return input_layer_; } - - void set_gradient_inputs_to_zero() - { - // nothing to do - } - - void clean() - { - grad_final.clear(); - cached_output.clear(); - cached_output_ptr = 0; - } - - friend void serialize(const add_tag_layer& item, std::ostream& out) - { - int version = 2; - serialize(version, out); - serialize(item.input_layer_, out); - serialize(item.cached_output, out); - serialize(item.grad_final, out); - serialize(item.gradient_input_is_stale, out); - serialize(item._sample_expansion_factor, out); - } - - friend void deserialize(add_tag_layer& item, std::istream& in) - { - int 
version = 0; - deserialize(version, in); - if (!(1 <= version && version <= 2)) - throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); - deserialize(item.input_layer_, in); - deserialize(item.cached_output, in); - deserialize(item.grad_final, in); - deserialize(item.gradient_input_is_stale, in); - item.cached_output_ptr = nullptr; - if (version >= 2) - deserialize(item._sample_expansion_factor, in); - else - item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. - - } - - friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) - { - int min_length = 0; - item.print(out, 0, min_length); - return out; - } - - void print (std::ostream& out, unsigned long idx, int& min_length) const - { - out << "layer<"<\t"< works. - if (!std::is_same::value) - out << "layer<"<< idx+1 << ">\t" << subnet() << "\n"; - } - - private: - - template - friend class add_layer; - template - friend class dimpl::subnet_wrapper; - template - friend class add_tag_layer; - template class T, typename U> - friend class add_skip_layer; - template class L, typename S> - friend class repeat; - - // You woudln't put a tag on a layer if you didn't want to access its forward - // outputs. So this is always true. - bool this_layer_requires_forward_output( - ) { return true; } - - void disable_output_and_gradient_getters ( - ) - { - // This should never happen because only inplace layers call - // disable_output_and_gradient_getters(), however, putting a tag layer right - // before an inplace layer basically means you don't want the following layer - // to operate in place. So the inplace layer should turn itself into an - // out-of-place layer and not call disable_output_and_gradient_getters(). 
- DLIB_CASSERT(false,"This should never happen"); - } - - tensor& private_get_output() const - { return const_cast(get_output()); } - tensor& private_get_gradient_input() - { return get_gradient_input(); } - - void swap(add_tag_layer& item) - { - std::swap(input_layer_, item.input_layer_); - std::swap(cached_output, item.cached_output); - std::swap(cached_output_ptr, item.cached_output_ptr); - std::swap(grad_final, item.grad_final); - std::swap(gradient_input_is_stale, item.gradient_input_is_stale); - std::swap(_sample_expansion_factor, item._sample_expansion_factor); - } - - subnet_type input_layer_; - resizable_tensor cached_output; - tensor* cached_output_ptr; - resizable_tensor grad_final; - bool gradient_input_is_stale; - mutable unsigned int _sample_expansion_factor; - }; - - template - struct is_nonloss_layer_type> : std::true_type {}; - - -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- - - template - class add_loss_layer; - - class no_label_type - { - private: - // We don't want anyone making these no_label_type objects. They are here only to - // allow add_loss_layer::training_label_type and dnn_trainer::training_label_type - // to exist which avoids needing to overload add_loss_layer and dnn_trainer for - // supervised an unsupervised losses. It also can be a type to use in template - // metaprogramming to indicate "no label". So here we make the constructor private - // with the exception that add_loss_layer objects can make it (again, just to - // simplify add_loss_layer's implementation). 
- no_label_type(){}; - template friend class add_loss_layer; - template < typename net_type, typename solver_type > friend class dnn_trainer; - }; - -// ---------------------------------------------------------------------------------------- - - template - class add_loss_layer - { - template - struct get_loss_layer_training_label_type - { - typedef no_label_type type; - }; - template - struct get_loss_layer_training_label_type::type> - { - typedef typename T::training_label_type type; - }; - - template - struct get_loss_layer_output_label_type - { - typedef no_label_type type; - }; - template - struct get_loss_layer_output_label_type::type> - { - typedef typename T::output_label_type type; - }; - - public: - typedef LOSS_DETAILS loss_details_type; - typedef SUBNET subnet_type; - typedef typename subnet_type::input_type input_type; - typedef typename subnet_type::input_layer_type input_layer_type; - const static size_t num_layers = subnet_type::num_layers + 1; - // Note that the loss layer doesn't count as an additional computational layer. - const static size_t num_computational_layers = subnet_type::num_computational_layers; - typedef typename get_loss_layer_training_label_type::type training_label_type; - typedef typename get_loss_layer_output_label_type::type output_label_type; - - static_assert(is_nonloss_layer_type::value, - "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer."); - - - add_loss_layer() {}; - add_loss_layer(const add_loss_layer&) = default; - add_loss_layer& operator=(const add_loss_layer&) = default; - add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); } - add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; } - - template - add_loss_layer( - const add_loss_layer& item - ) : - loss(item.loss_details()), - subnetwork(item.subnet()) - {} - - template - add_loss_layer( - const LOSS_DETAILS& layer_det, - T&& ...args - ) : - loss(layer_det), - subnetwork(std::forward(args)...) 
- { - } - - template - add_loss_layer( - LOSS_DETAILS&& layer_det, - T&& ...args - ) : - loss(std::move(layer_det)), - subnetwork(std::forward(args)...) - { - } - - template - struct disable_forwarding_constr - { - const static bool value = std::is_constructible::value; - }; - template - struct disable_forwarding_constr> - { - const static bool value = true; - }; - - template < - typename ...T, - typename = typename std::enable_if::type...>::value>::type - > - add_loss_layer( - T&& ...args - ) : - subnetwork(std::forward(args)...) - { - } - - template - void to_tensor ( - forward_iterator ibegin, - forward_iterator iend, - resizable_tensor& data - ) const - { - subnetwork.to_tensor(ibegin,iend,data); - } - - unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } - - template - void operator() ( - const tensor& x, - output_iterator obegin - ) - { - subnetwork.forward(x); - const dimpl::subnet_wrapper wsub(subnetwork); - loss.to_label(x, wsub, obegin); - } - - template - void operator() ( - forward_iterator ibegin, - forward_iterator iend, - output_iterator obegin - ) - { - to_tensor(ibegin,iend,temp_tensor); - (*this)(temp_tensor, obegin); - } - - const output_label_type& operator() (const input_type& x) - { - (*this)(&x, &x+1, &temp_label); - return temp_label; - } - - template - const output_label_type& process (const input_type& x, T&& ...args) - { - to_tensor(&x,&x+1,temp_tensor); - subnetwork.forward(temp_tensor); - const dimpl::subnet_wrapper wsub(subnetwork); - loss.to_label(temp_tensor, wsub, &temp_label, std::forward(args)...); - return temp_label; - } - - template - std::vector process_batch (const iterable_type& data, size_t batch_size, T&& ...args) - { - std::vector results(std::distance(data.begin(), data.end())); - auto o = results.begin(); - auto i = data.begin(); - auto num_remaining = results.size(); - while(num_remaining != 0) - { - auto inc = std::min(batch_size, num_remaining); - 
to_tensor(i,i+inc,temp_tensor); - subnetwork.forward(temp_tensor); - const dimpl::subnet_wrapper wsub(subnetwork); - loss.to_label(temp_tensor, wsub, o, std::forward(args)...); - - i += inc; - o += inc; - num_remaining -= inc; - } - return results; - } - - void back_propagate_error( - const tensor& x, - zero_gradients zero_grads = zero_gradients::yes - ) - { - subnet().back_propagate_error(x, zero_grads); - } - - void back_propagate_error( - const tensor& x, - const tensor& gradient_input, - zero_gradients zero_grads = zero_gradients::yes - ) - { - subnet().back_propagate_error(x, gradient_input, zero_grads); - } - - const tensor& get_final_data_gradient( - ) const - { - return subnet().get_final_data_gradient(); - } - - const tensor& forward(const tensor& x) - { - return subnet().forward(x); - } - - template - std::vector operator() ( - const iterable_type& data, - size_t batch_size = 128 - ) - { - std::vector results(std::distance(data.begin(), data.end())); - auto o = results.begin(); - auto i = data.begin(); - auto num_remaining = results.size(); - while(num_remaining != 0) - { - auto inc = std::min(batch_size, num_remaining); - (*this)(i, i+inc, o); - i += inc; - o += inc; - num_remaining -= inc; - } - return results; - } - - template - double compute_loss ( - const tensor& x, - label_iterator lbegin - ) - { - subnetwork.forward(x); - dimpl::subnet_wrapper wsub(subnetwork); - return loss.compute_loss_value_and_gradient(x, lbegin, wsub); - } - - template - double compute_loss ( - forward_iterator ibegin, - forward_iterator iend, - label_iterator lbegin - ) - { - to_tensor(ibegin,iend,temp_tensor); - return compute_loss(temp_tensor, lbegin); - } - - double compute_loss ( - const tensor& x - ) - { - subnetwork.forward(x); - dimpl::subnet_wrapper wsub(subnetwork); - return loss.compute_loss_value_and_gradient(x, wsub); - } - - template - double compute_loss ( - forward_iterator ibegin, - forward_iterator iend - ) - { - to_tensor(ibegin,iend,temp_tensor); - return 
compute_loss(temp_tensor); - } - - template - double compute_parameter_gradients ( - const tensor& x, - label_iterator lbegin, - zero_gradients zero_grads = zero_gradients::yes - ) - { - subnetwork.forward(x); - dimpl::subnet_wrapper wsub(subnetwork); - double l = loss.compute_loss_value_and_gradient(x, lbegin, wsub); - subnetwork.back_propagate_error(x, zero_grads); - return l; - } - template - double compute_parameter_gradients ( - forward_iterator ibegin, - forward_iterator iend, - label_iterator lbegin, - zero_gradients zero_grads = zero_gradients::yes - ) - { - to_tensor(ibegin,iend,temp_tensor); - return compute_parameter_gradients(temp_tensor, lbegin, zero_grads); - } - double compute_parameter_gradients ( - const tensor& x, - zero_gradients zero_grads = zero_gradients::yes - ) - { - subnetwork.forward(x); - dimpl::subnet_wrapper wsub(subnetwork); - double l = loss.compute_loss_value_and_gradient(x, wsub); - subnetwork.back_propagate_error(x, zero_grads); - return l; - } - template - double compute_parameter_gradients ( - forward_iterator ibegin, - forward_iterator iend, - zero_gradients zero_grads = zero_gradients::yes - ) - { - to_tensor(ibegin,iend,temp_tensor); - return compute_parameter_gradients(temp_tensor, zero_grads); - } - - template - void update_parameters ( - sstack solvers, - double learning_rate - ) - { - subnetwork.update_parameters(solvers, learning_rate); - } - - template - void update_parameters(std::vector& solvers, double learning_rate) - { - update_parameters(make_sstack(solvers), learning_rate); - } - - const subnet_type& subnet() const { return subnetwork; } - subnet_type& subnet() { return subnetwork; } - - const input_layer_type& input_layer() const { return subnet().input_layer(); } - input_layer_type& input_layer() { return subnet().input_layer(); } - - const loss_details_type& loss_details() const { return loss; } - loss_details_type& loss_details() { return loss; } - - void set_gradient_inputs_to_zero ( - ) - { - 
subnetwork.set_gradient_inputs_to_zero(); - } - - void clean ( - ) - { - temp_tensor.clear(); - subnetwork.clean(); - } - - template - friend void serialize(const add_loss_layer& item, std::ostream& out); - template - friend void deserialize(add_loss_layer& item, std::istream& in); - - friend std::ostream& operator<< (std::ostream& out, const add_loss_layer& item) - { - int min_length = 0; - item.print(out, 0, min_length); - return out; - } - - void print (std::ostream& out, unsigned long idx, int& min_length) const - { - out << "layer<" << idx << ">\t" << loss_details() << "\n"; - subnet().print(out, idx+1, min_length); - } - - private: - - - void swap(add_loss_layer& item) - { - std::swap(loss, item.loss); - std::swap(subnetwork, item.subnetwork); - } - - loss_details_type loss; - subnet_type subnetwork; - - // These two objects don't logically contribute to the state of this object. They - // are here to prevent them from being reallocated over and over. - output_label_type temp_label; - resizable_tensor temp_tensor; - }; - - template - void serialize(const add_loss_layer& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.loss, out); - serialize(item.subnetwork, out); - } - - template - void deserialize(add_loss_layer& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer."); - deserialize(item.loss, in); - deserialize(item.subnetwork, in); - } - - - template - struct is_loss_layer_type> : std::true_type {}; - -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- - - namespace impl - { - template - struct layer_helper - { - static_assert(i < T::num_layers, "Call to 
layer() attempted to access non-existing layer in neural network."); - static T& makeT(); - // If you get error here mentioning lack of member "subnet" in "dlib::input<...>", - // then likely your "dlib::layer<...>" invocation wasn't able to find requested layer. - // This could happen for instance when trying to use skip layer for non-existing tag. - using next_type = typename std::remove_reference::type; - using type = typename layer_helper::type; - static type& layer(T& n) - { - return layer_helper::layer(n.subnet()); - } - }; - template < - unsigned int i, - size_t N, template class L, typename S - > - struct layer_helper, typename std::enable_if<(i!=0&&i>=repeat::layers_in_repeated_group)>::type> - { - const static size_t layers_in_repeated_group = repeat::layers_in_repeated_group; - - static repeat& makeT(); - using next_type = typename std::remove_reference::type; - using type = typename layer_helper::type; - static type& layer(repeat& n) - { - return layer_helper::layer(n.subnet()); - } - }; - template < - unsigned int i, - size_t N, template class L, typename S - > - struct layer_helper, typename std::enable_if<(i!=0&&i::layers_in_repeated_group)>::type> - { - const static size_t layers_in_each_group = repeat::layers_in_each_group; - typedef typename repeat::repeated_layer_type repeated_layer_type; - using next_type = repeated_layer_type; - using type = typename layer_helper::type; - static type& layer(repeat& n) - { - return layer_helper::layer(n.get_repeated_layer(i/layers_in_each_group)); - } - }; - template < - size_t N, template class L, typename S - > - struct layer_helper<0,repeat, void> - { - typedef typename repeat::repeated_layer_type repeated_layer_type; - using type = repeated_layer_type; - static type& layer(repeat& n) - { - return n.get_repeated_layer(0); - } - }; - - - - template < - unsigned int i, - size_t N, template class L, typename S - > - struct layer_helper, typename std::enable_if<(i!=0&&i>=repeat::layers_in_repeated_group)>::type> 
- { - const static size_t layers_in_repeated_group = repeat::layers_in_repeated_group; - - static const repeat& makeT(); - using next_type = const typename std::remove_reference::type; - using type = const typename layer_helper::type; - static type& layer(const repeat& n) - { - return layer_helper::layer(n.subnet()); - } - }; - template < - unsigned int i, - size_t N, template class L, typename S - > - struct layer_helper, typename std::enable_if<(i!=0&&i::layers_in_repeated_group)>::type> - { - const static size_t layers_in_each_group = repeat::layers_in_each_group; - typedef typename repeat::repeated_layer_type repeated_layer_type; - using next_type = const repeated_layer_type; - using type = const typename layer_helper::type; - static type& layer(const repeat& n) - { - return layer_helper::layer(n.get_repeated_layer(i/layers_in_each_group)); - } - }; - template < - size_t N, template class L, typename S - > - struct layer_helper<0,const repeat, void> - { - typedef typename repeat::repeated_layer_type repeated_layer_type; - using type = const repeated_layer_type; - static type& layer(const repeat& n) - { - return n.get_repeated_layer(0); - } - }; - - - - template - struct layer_helper<0,T,void> - { - using type = T; - static type& layer(T& n) - { - return n; - } - }; - - template class Match, typename T, unsigned int i, typename enabled = void> - struct layer_helper_match - { - static T& makeT(); - using next_type = typename std::remove_reference::type; - using type = typename layer_helper_match::type; - static type& layer(T& n) - { - return layer_helper_match::layer(n.subnet()); - } - }; - // This overload catches add_layer and add_loss_layer templates. - template class Match, typename T, unsigned int i> - struct layer_helper_match>::value>::type> - { - using type = typename layer_helper::type; - static type& layer(T& n) - { - return layer_helper::layer(n); - } - }; - // This overload catches input templates. 
- template class Match, typename T, unsigned int i> - struct layer_helper_match>::value>::type> - { - using type = typename layer_helper::type; - static type& layer(T& n) - { - return layer_helper::layer(n); - } - }; - // This overload catches subnet_wrapper templates. - template class Match, typename T, unsigned int i> - struct layer_helper_match>::value>::type> - { - using type = typename layer_helper::type; - static type& layer(T& n) - { - return layer_helper::layer(n); - } - }; - } - - template - typename impl::layer_helper::type& layer (T& n) - { - return impl::layer_helper::layer(n); - } - - template class Match, typename T> - typename impl::layer_helper_match::type& layer (T& n) - { - return impl::layer_helper_match::layer(n); - } - - template class Match, unsigned int i, typename T> - typename impl::layer_helper_match::type& layer (T& n) - { - return impl::layer_helper_match::layer(n); - } - -// ---------------------------------------------------------------------------------------- - - template - typename net_type::input_layer_type& input_layer ( - net_type& net - ) - { - return net.input_layer(); - } - - template - const typename net_type::input_layer_type& input_layer ( - const net_type& net - ) - { - return net.input_layer(); - } - -// ---------------------------------------------------------------------------------------- - - template class TAG_TYPE, typename SUBNET> - class add_skip_layer - { - public: - typedef SUBNET subnet_type; - typedef typename subnet_type::input_type input_type; - typedef typename subnet_type::input_layer_type input_layer_type; - typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. 
- const static size_t num_layers = subnet_type::num_layers + 1; - const static size_t num_computational_layers = subnet_type::num_computational_layers; - const static unsigned long id = tag_id::id; - - add_skip_layer() {}; - add_skip_layer(const add_skip_layer&) = default; - add_skip_layer(add_skip_layer&&) = default; - add_skip_layer& operator=(add_skip_layer&&) = default; - add_skip_layer& operator=(const add_skip_layer&) = default; - - template - add_skip_layer( - const add_skip_layer& item - ) : subnetwork(item.subnet()) - {} - - template - add_skip_layer( - T ...args - ) : - subnetwork(std::move(args)...) - { - } - - template - void to_tensor ( - forward_iterator ibegin, - forward_iterator iend, - resizable_tensor& data - ) const - { - subnetwork.to_tensor(ibegin,iend,data); - } - - template - const tensor& operator() ( - forward_iterator ibegin, - forward_iterator iend - ) - { - subnetwork(ibegin,iend); - return layer(subnetwork).get_output(); - } - - const tensor& operator() (const input_type& x) - { - subnetwork(x); - return layer(subnetwork).get_output(); - } - - const tensor& forward(const tensor& x) - { - subnetwork.forward(x); - return layer(subnetwork).get_output(); - } - - const tensor& get_output() const - { - return layer(subnetwork).get_output(); - } - - tensor& get_gradient_input() - { - return layer(subnetwork).get_gradient_input(); - } - - const tensor& get_final_data_gradient( - ) const - { - return subnetwork.get_final_data_gradient(); - } - - void back_propagate_error( - const tensor& x, - zero_gradients zero_grads = zero_gradients::yes - ) - { - subnetwork.back_propagate_error(x, zero_grads); - } - - template - void update_parameters(sstack solvers, double learning_rate) - { - subnetwork.update_parameters(solvers, learning_rate); - } - - template - void update_parameters(std::vector& solvers, double learning_rate) - { - update_parameters(make_sstack(solvers), learning_rate); - } - - const tensor& get_parameter_gradient( - ) const { return 
params_grad; } - - tensor& get_parameter_gradient ( - ) { return params_grad; } - - - const subnet_type& subnet() const - { - return subnetwork; - } - - subnet_type& subnet() - { - return subnetwork; - } - - const input_layer_type& input_layer() const { return subnet().input_layer(); } - input_layer_type& input_layer() { return subnet().input_layer(); } - - unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } - - void set_gradient_inputs_to_zero() - { - subnetwork.set_gradient_inputs_to_zero(); - } - - void clean() - { - subnetwork.clean(); - } - - friend void serialize(const add_skip_layer& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.subnetwork, out); - } - - friend void deserialize(add_skip_layer& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer."); - deserialize(item.subnetwork, in); - } - - friend std::ostream& operator<< (std::ostream& out, const add_skip_layer& item) - { - int min_length = 0; - item.print(out, 0, min_length); - return out; - } - - void print (std::ostream& out, unsigned long idx, int& min_length) const - { - out << "layer<" << idx << ">\t"< - friend class add_layer; - template - friend class dimpl::subnet_wrapper; - template - friend class add_tag_layer; - template class T, typename U> - friend class add_skip_layer; - template class L, typename S> - friend class repeat; - - bool this_layer_requires_forward_output( - ) { return layer(subnetwork).this_layer_requires_forward_output(); } - - void disable_output_and_gradient_getters ( - ) { layer(subnetwork).disable_output_and_gradient_getters(); } - - tensor& private_get_output() const - { return layer(subnetwork).private_get_output(); } - tensor& private_get_gradient_input() - { return layer(subnetwork).private_get_gradient_input(); } - - subnet_type subnetwork; - - // 
This member doesn't logically contribute to the state of the object since it is - // always empty. It's just here so we can have the get_parameter_gradient() methods - // which have to return something. So they return this empty tensor. - resizable_tensor params_grad; - }; - template class T, typename U> - struct is_nonloss_layer_type> : std::true_type {}; - - template using tag1 = add_tag_layer< 1, SUBNET>; - template using tag2 = add_tag_layer< 2, SUBNET>; - template using tag3 = add_tag_layer< 3, SUBNET>; - template using tag4 = add_tag_layer< 4, SUBNET>; - template using tag5 = add_tag_layer< 5, SUBNET>; - template using tag6 = add_tag_layer< 6, SUBNET>; - template using tag7 = add_tag_layer< 7, SUBNET>; - template using tag8 = add_tag_layer< 8, SUBNET>; - template using tag9 = add_tag_layer< 9, SUBNET>; - template using tag10 = add_tag_layer<10, SUBNET>; - - template using skip1 = add_skip_layer< tag1, SUBNET>; - template using skip2 = add_skip_layer< tag2, SUBNET>; - template using skip3 = add_skip_layer< tag3, SUBNET>; - template using skip4 = add_skip_layer< tag4, SUBNET>; - template using skip5 = add_skip_layer< tag5, SUBNET>; - template using skip6 = add_skip_layer< tag6, SUBNET>; - template using skip7 = add_skip_layer< tag7, SUBNET>; - template using skip8 = add_skip_layer< tag8, SUBNET>; - template using skip9 = add_skip_layer< tag9, SUBNET>; - template using skip10 = add_skip_layer; - -// ---------------------------------------------------------------------------------------- - - namespace timpl - { - inline void fill_with_gassuan_random_numbers ( - tensor& t, - dlib::rand& rnd, - double sigma = 1 - ) - { - float* data = t.host(); - for (size_t i = 0; i < t.size(); ++i) - data[i] = rnd.get_random_gaussian()*sigma; - } - - class test_layer_subnet - { - public: - test_layer_subnet ( - dlib::rand& rnd_ - ) : rnd(rnd_) - { - // Output and gradient_input have to have the same dimensions in each - // layer. 
- const long num_samples = rnd.get_random_32bit_number()%4+3; - const long k = rnd.get_random_32bit_number()%4+2; - const long nr = ((rnd.get_random_32bit_number()%4)/2)*2+2; - const long nc = ((rnd.get_random_32bit_number()%4)/2)*2+2; - - output.set_size(num_samples, k, nr, nc); - gradient_input.set_size(num_samples, k, nr, nc); - - // Use a non-zero initial gradient to make sure the layers add to it - // rather than assign and blow away the initial value. - fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01); - - fill_with_gassuan_random_numbers(output, rnd); - } - - - tensor& get_mutable_output() { return output; } - const tensor& get_output() const { return output; } - const tensor& private_get_output() const { return get_output(); } - const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; } - - tensor& get_gradient_input() { return gradient_input; } - tensor& private_get_gradient_input() { return get_gradient_input(); } - test_layer_subnet& subnet() { init_sub(); return *subnetwork; } - - - - unsigned long count_outputs() const - { - if (subnetwork) - return subnetwork->count_outputs() + output.size(); - else - return output.size(); - } - - float& get_output_element(unsigned long i) - { - if (i < output.size()) - return output.host()[i]; - else - return subnet().get_output_element(i-output.size()); - } - - float get_gradient_input_element(unsigned long i) const - { - if (i < gradient_input.size()) - return gradient_input.host()[i]; - else - return subnet().get_gradient_input_element(i-gradient_input.size()); - } - - - private: - // We lazily initialize sub-layers as needed when someone tries to call - // subnet() - void init_sub() const - { - if (!subnetwork) - subnetwork.reset(new test_layer_subnet(rnd)); - } - - dlib::rand& rnd; - mutable std::unique_ptr subnetwork; - resizable_tensor output; - resizable_tensor gradient_input; - }; - - } - - struct layer_test_results - { - layer_test_results() : was_good(true) {} - explicit 
layer_test_results(const std::string& l) : log(l),was_good(false) {} - - std::string log; - bool was_good; - - operator bool() const { return was_good; } - }; - - inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item) - { - out << item.log; - return out; - } - - template < - typename layer_details_type - > - layer_test_results impl_test_layer ( - layer_details_type l, - const float base_eps - ) - { - using namespace timpl; - // Do some setup - running_stats rs_data, rs_params; - dlib::rand rnd; - std::ostringstream sout; - for (int iter = 0; iter < 10; ++iter) - { - test_layer_subnet subnetwork(rnd); - resizable_tensor output, out2, out3; - // Run setup() and forward() as well to make sure any calls to subnet() have - // happened before we start assuming we know how many data elements there are - // (since we do a lazy layer creation thing based on calls to subnet() inside - // test_layer_subnet). - l.setup(subnetwork); - impl::call_layer_forward(l, subnetwork, output); - - resizable_tensor input_grad; - input_grad.copy_size(output); - fill_with_gassuan_random_numbers(input_grad, rnd); - - - // The f() we are computing gradients of is this thing. It's value at the current - // parameter and data values is: - //sout << "f(data,params): " << dot(output, input_grad) << std::endl; - - // We are going to save a copy of the subnetwork.get_gradient_input() data before we do - // backpropagation since the backward() function is supposed to *add* to the - // gradients rather than overwrite them. We will use this saved data to check if - // that is the case. - const unsigned long num_data_inputs = subnetwork.count_outputs(); - std::vector initial_gradient_input(num_data_inputs); - for (unsigned long i = 0; i < num_data_inputs; ++i) - initial_gradient_input[i] = subnetwork.get_gradient_input_element(i); - - - // Now tell the layer to compute all the gradients. 
In the rest of this function - // we will just be checking that these gradients were computed correctly by - // comparing them to a central differences approximation. - resizable_tensor params_grad; - params_grad.copy_size(l.get_layer_params()); - // But first, set the params grad to something crazy so that it's very obvious if - // it doesn't get fully assigned. - params_grad = std::numeric_limits::infinity(); - impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad); - - static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork), - "Layer not defined correctly. forward and backward methods must either both be in-place or both out-of-place. "); - - // Make sure the outputs of forward() and backward() are the same when they are run - // in in-place mode. - if (impl::is_inplace_layer(l, subnetwork)) - { - test_layer_subnet subnetwork2(rnd); - layer_details_type ll(l); - ll.setup(subnetwork2); - resizable_tensor ip_out; - impl::call_layer_forward(ll, subnetwork2, ip_out); - impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output()); - const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output()))); - if (forward_error > 0.00001) - { - sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n"; - sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << std::endl; - return layer_test_results(sout.str()); - } - - resizable_tensor params_grad; - params_grad.copy_size(ll.get_layer_params()); - params_grad = std::numeric_limits::infinity(); - - resizable_tensor input_grad; - input_grad.copy_size(ip_out); - fill_with_gassuan_random_numbers(input_grad, rnd); - resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2; - params_grad1 = params_grad; - params_grad2 = params_grad; - // Now call backward() and make sure it works as well. 
Recall that when an - // in-place layer works in-place it assigns to it's outputs but when it's - // not running in-place it adds. So we initialize to a non-zero value to - // check that this is the behavior that really executes. - subnetwork2.get_gradient_input() = 9; - impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1); - data_grad1 = subnetwork2.get_gradient_input(); - - subnetwork2.get_gradient_input() = mat(input_grad); - impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2); - data_grad2 = subnetwork2.get_gradient_input(); - if (params_grad.size() != 0) - { - const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2))); - if (backward_param_error > 0.00001) - { - sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; - sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << std::endl; - return layer_test_results(sout.str()); - } - } - const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2))); - if (backward_data_error > 0.00001) - { - sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; - sout << "changes when invoked in-place vs. out-of-place. 
The error was: " << backward_data_error << std::endl; - return layer_test_results(sout.str()); - } - } - - // ================================================================== - // first validate the way the parameter gradients are computed - for (unsigned long i = 0; i < params_grad.size(); ++i) - { - layer_details_type l1(l); - - float eps = l1.get_layer_params().host()[i]*base_eps; - if (eps == 0) - eps = base_eps; - const float oldval = l1.get_layer_params().host()[i]; - l1.get_layer_params().host()[i] = oldval+eps; - impl::call_layer_forward(l1, subnetwork, out2); - l1.get_layer_params().host()[i] = oldval-eps; - impl::call_layer_forward(l1, subnetwork, out3); - l1.get_layer_params().host()[i] = oldval; - - // Compute a reference derivative via a central differences approximation and - // compare it to the one output by the layer and make sure they match. - double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); - double output_derivative = params_grad.host()[i]; - double relative_error; - if (reference_derivative*output_derivative != 0) - relative_error = (reference_derivative - output_derivative)/(reference_derivative); - else - relative_error = (reference_derivative - output_derivative); - double absolute_error = (reference_derivative - output_derivative); - rs_params.add(std::abs(relative_error)); - if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) - { - sout << "Gradient error in parameter #" << i <<". 
Relative error: "<< relative_error << std::endl; - sout << "expected derivative: " << reference_derivative << std::endl; - sout << "output derivative: " << output_derivative << std::endl; - sout << "iteration: " << iter << std::endl; - return layer_test_results(sout.str()); - } - } - - // ================================================================== - // now validate the data gradients - for (unsigned long i = 0; i < num_data_inputs; ++i) - { - const float oldval = subnetwork.get_output_element(i); - float eps = oldval*base_eps; - if (eps == 0) - eps = base_eps; - subnetwork.get_output_element(i) = oldval+eps; - impl::call_layer_forward(l, subnetwork, out2); - subnetwork.get_output_element(i) = oldval-eps; - impl::call_layer_forward(l, subnetwork, out3); - subnetwork.get_output_element(i) = oldval; - - // Compute a reference derivative via a central differences approximation and - // compare it to the one output by the layer and make sure they match. - double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); - double output_derivative = subnetwork.get_gradient_input_element(i); - output_derivative -= initial_gradient_input[i]; - double relative_error; - if (reference_derivative*output_derivative != 0) - relative_error = (reference_derivative - output_derivative)/(reference_derivative); - else - relative_error = (reference_derivative - output_derivative); - double absolute_error = (reference_derivative - output_derivative); - rs_data.add(std::abs(relative_error)); - if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) - { - sout << "Gradient error in data variable #" << i <<". 
Relative error: "<< relative_error << std::endl; - sout << "expected derivative: " << reference_derivative << std::endl; - sout << "output derivative: " << output_derivative << std::endl; - sout << "iteration: " << iter << std::endl; - return layer_test_results(sout.str()); - } - } - - } // end for (int iter = 0; iter < 10; ++iter) - - if (rs_params.mean() > 0.003) - { - sout << "Average parameter gradient error is somewhat large at: "<< rs_params.mean() << std::endl; - return layer_test_results(sout.str()); - } - if (rs_data.mean() > 0.003) - { - sout << "Average data gradient error is somewhat large at: "<< rs_data.mean() << std::endl; - return layer_test_results(sout.str()); - } - - return layer_test_results(); - } - - template < - typename layer_details_type - > - layer_test_results test_layer ( - layer_details_type l - ) - { - // Try a few different derivative step sizes to see if any work. - for (float base_eps = 0.0001; base_eps < 0.1; base_eps *= 2) - { - auto result = impl_test_layer(l, base_eps); - if (result) - return result; - } - // However, if none of the step sizes worked then try this one and probably result - // in returning an error. - return impl_test_layer(l, 0.01); - } - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - template - struct vl_loop - { - template < - typename net_type, - typename visitor - > - static void visit( - net_type& net, - visitor&& v - ) - { - // Call whatever version of the visitor the user provided. - call_if_valid(v, i, layer(net)); - call_if_valid(v, layer(net)); - vl_loop::visit(net,v); - } - }; - - template - struct vl_loop - { - template < - typename net_type, - typename visitor - > - static void visit( - net_type&, - visitor&& - ) - { - // Base case of recursion. Don't do anything. 
- } - }; - - template - struct vl_loop_backwards - { - template < - typename net_type, - typename visitor - > - static void visit( - net_type& net, - visitor&& v - ) - { - vl_loop_backwards::visit(net,v); - // Call whatever version of the visitor the user provided. - call_if_valid(v, i, layer(net)); - call_if_valid(v, layer(net)); - } - }; - - template - struct vl_loop_backwards - { - template < - typename net_type, - typename visitor - > - static void visit( - net_type&, - visitor&& - ) - { - // Base case of recursion. Don't do anything. - } - }; - - } - - template < - typename net_type, - typename visitor - > - void visit_layers( - net_type& net, - visitor v - ) - { - impl::vl_loop<0, net_type::num_layers>::visit(net, v); - } - - template < - typename net_type, - typename visitor - > - void visit_layers_backwards( - net_type& net, - visitor v - ) - { - impl::vl_loop_backwards<0, net_type::num_layers>::visit(net, v); - } - - template < - size_t begin, - size_t end, - typename net_type, - typename visitor - > - void visit_layers_range( - net_type& net, - visitor v - ) - { - static_assert(begin <= end, "Invalid range"); - static_assert(end <= net_type::num_layers, "Invalid range"); - impl::vl_loop::visit(net, v); - } - - template < - size_t begin, - size_t end, - typename net_type, - typename visitor - > - void visit_layers_backwards_range( - net_type& net, - visitor v - ) - { - static_assert(begin <= end, "Invalid range"); - static_assert(end <= net_type::num_layers, "Invalid range"); - impl::vl_loop_backwards::visit(net, v); - } - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - template - struct vl_until_tag - { - template < - typename net_type, - typename next_net_type, - typename visitor - > - static void visit( - net_type& net, - next_net_type& next_net, - visitor&& v - ) - { - call_if_valid(v, next_net); - vl_until_tag::visit(net,layer(net),v); - } - - template < - typename net_type, - 
typename SUBNET, - typename visitor - > - static void visit( - net_type&, - const add_tag_layer& next_net, - visitor&& v - ) - { - call_if_valid(v, next_net); - } - - template < - typename net_type, - typename SUBNET, - typename visitor - > - static void visit( - net_type&, - add_tag_layer& next_net, - visitor&& v - ) - { - call_if_valid(v, next_net); - } - }; - } - - template < - unsigned long tag_id, - typename net_type, - typename visitor - > - void visit_layers_until_tag( - net_type& net, - visitor v - ) - { - impl::vl_until_tag<0,tag_id>::visit(net, net, v); - } - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - template < - typename visitor - > - class visitor_computational_layer - { - public: - explicit visitor_computational_layer(visitor& v) : v_(v) {} - - template - void do_visit(size_t idx, layer& l) const - { - // Call whatever version of the visitor the user provided. - call_if_valid(v_, idx, l.layer_details()); - call_if_valid(v_, l.layer_details()); - } - - // const case - template - void operator()(size_t idx, const add_layer& l) const { do_visit(idx, l); } - // non-const cast - template - void operator()(size_t idx, add_layer& l) const { do_visit(idx, l); } - - private: - - visitor& v_; - }; - } - - template < - typename net_type, - typename visitor - > - void visit_computational_layers( - net_type& net, - visitor v - ) - { - visit_layers(net, impl::visitor_computational_layer(v)); - } - - template < - size_t begin, - size_t end, - typename net_type, - typename visitor - > - void visit_computational_layers_range( - net_type& net, - visitor v - ) - { - visit_layers_range(net, impl::visitor_computational_layer(v)); - } - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - template < - typename visitor - > - class visit_layer_parameters - { - public: - explicit visit_layer_parameters(visitor& v) : v_(v) {} - - template 
- void operator()(layer& l) - { - // Call whatever version of the visitor the user provided. - const bool visitor_called = call_if_valid(v_, computational_layer_idx, l.get_layer_params()) || - call_if_valid(v_, l.get_layer_params()); - DLIB_CASSERT(visitor_called, "A visitor function with an incorrect signature was given to visit_layer_parameters()"); - ++computational_layer_idx; - } - private: - - size_t computational_layer_idx = 0; - visitor& v_; - }; - } - - template < - typename net_type, - typename visitor - > - void visit_layer_parameters( - net_type& net, - visitor v - ) - { - visit_computational_layers(net, impl::visit_layer_parameters(v)); - } - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - template < - typename visitor - > - class visit_layer_parameter_gradients - { - public: - explicit visit_layer_parameter_gradients(visitor& v) : v_(v) {} - - template - void do_visit(layer& l) - { - // Call whatever version of the visitor the user provided. - const bool visitor_called = call_if_valid(v_, computational_layer_idx, l.get_parameter_gradient()) || - call_if_valid(v_, l.get_parameter_gradient()); - DLIB_CASSERT(visitor_called, "A visitor function with an incorrect signature was given to visit_layer_parameter_gradients()"); - ++computational_layer_idx; - } - - // const version - template - void operator()(const add_layer& l) { do_visit(l); } - // non-const version - template - void operator()(add_layer& l) { do_visit(l); } - - private: - - size_t computational_layer_idx = 0; - visitor& v_; - }; - } - - template < - typename net_type, - typename visitor - > - void visit_layer_parameter_gradients( - net_type& net, - visitor v - ) - { - visit_layers(net, impl::visit_layer_parameter_gradients(v)); - } - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_DNn_CORE_H_ - - +// Copyright (C) 2015 Davis E. 
King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_CORE_H_ +#define DLIB_DNn_CORE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core_abstract.h" +#include "../cuda/tensor.h" +#include "../cuda/tensor_tools.h" +#include "../statistics.h" +#include "../rand.h" +#include "../algs.h" +#include "../metaprogramming.h" +#include "../utility.h" +#include "../constexpr_if.h" + +#ifdef _MSC_VER +// Tell Visual Studio not to recursively inline functions very much because otherwise it +// takes hours to compile the DNN code sometimes. It's crazy. Hopefully we can remove +// this some day when the visual studio compiler is more efficient. +#pragma inline_depth(2) +#endif + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template + using has_get_learning_rate_multiplier = decltype(std::declval().get_learning_rate_multiplier()); + + template + using has_set_learning_rate_multiplier = decltype(std::declval().set_learning_rate_multiplier(double{})); + + template + using has_get_bias_learning_rate_multiplier = decltype(std::declval().get_bias_learning_rate_multiplier()); + + template + using has_set_bias_learning_rate_multiplier = decltype(std::declval().set_bias_learning_rate_multiplier(double{})); + + template + using has_get_weight_decay_multiplier = decltype(std::declval().get_weight_decay_multiplier()); + + template + using has_set_weight_decay_multiplier = decltype(std::declval().set_weight_decay_multiplier(double{})); + + template + using has_get_bias_weight_decay_multiplier = decltype(std::declval().get_bias_weight_decay_multiplier()); + + template + using has_set_bias_weight_decay_multiplier = decltype(std::declval().set_bias_weight_decay_multiplier(double{})); + + template + using has_disable_bias = decltype(std::declval().disable_bias()); + + template + using 
has_clean = decltype(std::declval().clean()); + } + +// ---------------------------------------------------------------------------------------- + + template + double get_learning_rate_multiplier(const T& obj) + { + return switch_(bools(is_detected{}), + [&](true_t, auto _) { return _(obj).get_learning_rate_multiplier(); }, + [](auto...) { return 1.0; } + ); + } + + template + void set_learning_rate_multiplier( + T& obj, + double learning_rate_multiplier + ) + { + DLIB_CASSERT(learning_rate_multiplier >= 0); + switch_(bools(is_detected{}), + [&](true_t, auto _) { _(obj).set_learning_rate_multiplier(learning_rate_multiplier); }, + [](auto...) {/*no-op*/} + ); + } + +// ---------------------------------------------------------------------------------------- + + template + double get_bias_learning_rate_multiplier(const T& obj) + { + return switch_(bools(is_detected{}), + [&](true_t, auto _) { return _(obj).get_bias_learning_rate_multiplier(); }, + [](auto...) { return 1.0; } + ); + } + + template + void set_bias_learning_rate_multiplier( + T& obj, + double bias_learning_rate_multiplier + ) + { + DLIB_CASSERT(bias_learning_rate_multiplier >= 0); + switch_(bools(is_detected{}), + [&](true_t, auto _) { _(obj).set_bias_learning_rate_multiplier(bias_learning_rate_multiplier); }, + [](auto...) {/*no-op*/} + ); + } + +// ---------------------------------------------------------------------------------------- + + template + double get_weight_decay_multiplier(const T& obj) + { + return switch_(bools(is_detected{}), + [&](true_t, auto _) { return _(obj).get_weight_decay_multiplier(); }, + [](auto...) { return 1.0; } + ); + } + + template + void set_weight_decay_multiplier( + T& obj, + double weight_decay_multiplier + ) + { + DLIB_CASSERT(weight_decay_multiplier >= 0); + switch_(bools(is_detected{}), + [&](true_t, auto _) { _(obj).set_weight_decay_multiplier(weight_decay_multiplier); }, + [](auto...) 
{/*no-op*/} + ); + } + +// ---------------------------------------------------------------------------------------- + + template + double get_bias_weight_decay_multiplier(const T& obj) + { + return switch_(bools(is_detected{}), + [&](true_t, auto _) { return _(obj).get_bias_weight_decay_multiplier(); }, + [](auto...) { return 1.0; } + ); + } + + template + void set_bias_weight_decay_multiplier( + T& obj, + double bias_weight_decay_multiplier + ) + { + DLIB_CASSERT(bias_weight_decay_multiplier >= 0); + switch_(bools(is_detected{}), + [&](true_t, auto _) { _(obj).set_bias_weight_decay_multiplier(bias_weight_decay_multiplier); }, + [](auto...) {/*no-op*/} + ); + } + +// ---------------------------------------------------------------------------------------- + + template + void disable_bias( + T& obj + ) + { + switch_(bools(is_detected{}), + [&](true_t, auto _) { _(obj).disable_bias(); }, + [](auto...) { /*no-op*/ } + ); + } + +// ---------------------------------------------------------------------------------------- + + template + void call_clean_method_if_exists(T& obj) + /*! + ensures + - calls obj.clean() if obj has a .clean() method. + !*/ + { + switch_(bools(is_detected{}), + [&](true_t, auto _) { _(obj).clean(); }, + [](auto...) { /*no-op*/ } + ); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + class repeat_input_layer + { + /*! + None of the declarations in this object are really used. The only reason it + exists is to allow the repeat object to use a special input layer in its + internal networks which will cause add_tag_layer objects that happen to be + right at the input to not create copies of their input tensors. So + introducing the repeat_input_layer object allows us to optimize the + implementation of add_tag_layer for a special case that arises when it's + used in the context of the repeat layer. 
+ !*/ + public: + typedef int input_type; + + template + void to_tensor ( + forward_iterator , + forward_iterator , + resizable_tensor& + ) const + { + } + + friend void serialize(const repeat_input_layer&, std::ostream&){} + friend void deserialize(repeat_input_layer&, std::istream&){} + friend std::ostream& operator<<(std::ostream& out, const repeat_input_layer&) { return out; } + }; + + inline std::string tensor_to_str ( + const tensor& t, + int& min_length + ) + { + if (t.size() == 0) + return ""; + + std::ostringstream sout; + sout << "output size=(num:"<< t.num_samples() << ", "; + sout << "k:" << t.k() << ","; + while (sout.tellp() < 28) sout << " "; + sout << "nr:" << t.nr() << ","; + while (sout.tellp() < 28+8) sout << " "; + sout << "nc:" << t.nc() << ")"; + while (sout.tellp() < min_length) sout << " "; + min_length = sout.tellp(); + sout << "\t"; + return sout.str(); + } + } + +// ---------------------------------------------------------------------------------------- + + // Tell us if T is one of the special layer types (i.e. add_layer, repeat, add_tag_layer, or + // add_skip_layer). + template struct is_nonloss_layer_type : std::false_type {}; + // Tell us if T is an instance of add_loss_layer. 
+ template struct is_loss_layer_type : std::false_type {}; + // Tell us if T is an instance of add_layer + template struct is_add_layer : std::false_type {}; + + namespace impl + { + template + auto tuple_subset( + const Tuple& item, + std::index_sequence + ) + { + return std::make_tuple(std::get(item)...); + } + + template + auto basic_tuple_tail( + const std::tuple& item + ) + { + return tuple_subset(item, pop_front_t>{}); + } + + template + auto tuple_flatten(const T& t) + { + return std::make_tuple(t); + } + + template + auto tuple_flatten( + const std::tuple& item, + std::index_sequence + ) + { + return std::tuple_cat(tuple_flatten(std::get(item))...); + } + + template + auto tuple_flatten(const std::tuple& item) + { + return tuple_flatten(item, std::index_sequence_for{}); + } + + template + struct tuple_head_helper + { + typedef T type; + static const type& get(const T& item) + { + return item; + } + }; + + template + struct tuple_head_helper> + { + typedef typename tuple_head_helper::type type; + static const type& get(const std::tuple& item) + { + return tuple_head_helper::get(std::get<0>(item)); + } + }; + + template struct alwaysbool { typedef bool type; }; + // one more structure for VS 2015 UP3 support workaround + template struct alwaysbool2 { typedef bool type; }; + + resizable_tensor& rt(); + + // The significance of a layer's backward method requiring forward's outputs is + // that such as layer can't have an in-place layer stacked on top of it because + // in-place layers overwrite the output of the layer they sit on top of. 
+ template + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool::type + { + return true; + } + + template + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool::type + { + return false; + } + + template + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool::type + { + return true; + } + + template + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool::type + { + return false; + } + + template + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2::type + { + return false; + } + + template + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2::type + { + return false; + } + + template + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2::type + { + return true; + } + + template + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2::type + { + return true; + } + + template + constexpr auto is_inplace_layer( + layer_type& layer, + const SUBNET& sub + ) -> typename alwaysbool2::type + { + return false; + } + + template + constexpr auto is_inplace_layer( + layer_type& layer, + const SUBNET& sub + ) -> typename alwaysbool::type + { + return true; + } + + template + auto call_layer_backward( + layer_type& layer, + const tensor& computed_output, + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad)) + { + layer.backward(computed_output,gradient_input,sub,params_grad); + } + + template + auto call_layer_backward( + layer_type& layer, + const tensor& , + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> 
decltype(layer.backward(gradient_input,sub,params_grad)) + { + layer.backward(gradient_input,sub,params_grad); + } + + template + auto call_layer_backward( + layer_type& layer, + const tensor& computed_output, + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad)) + { + layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad); + } + + template + auto call_layer_backward( + layer_type& layer, + const tensor& , + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad)) + { + layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad); + } + + + template + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + tensor& /*data_output*/ + ) -> decltype(layer.forward(sub,rt())) + { + // This overload of call_layer_forward() is here because this template + // naturally gets instantiated but only on code paths that never get executed. + // So rather than writing a bunch of hard to read template magic around call + // sites we just have this overload that doesn't do anything (and an assert to + // make sure that's the case). 
+ DLIB_CASSERT(false, "This should never happen"); + } + + template + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + resizable_tensor& data_output + ) -> decltype(layer.forward(sub,data_output)) + { + layer.forward(sub,data_output); + } + + template + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + tensor& data_output + ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) + { + layer.forward_inplace(sub.get_output(),data_output); + } + + template + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + resizable_tensor& data_output + ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) + { + if (!have_same_dimensions(data_output, sub.get_output())) + data_output.copy_size(sub.get_output()); + layer.forward_inplace(sub.get_output(),static_cast(data_output)); + } + + + } // end namespace impl + + template + auto tuple_head ( + const std::tuple& item + ) + { + return impl::tuple_head_helper>::get(item); + } + + template + auto tuple_tail( + const std::tuple& item + ) + { + return impl::basic_tuple_tail(impl::tuple_flatten(item)); + } + + inline std::tuple<> tuple_tail( + const std::tuple<>& item + ) + { + return item; + } +// ---------------------------------------------------------------------------------------- + + template + class sstack + { + public: + typedef T value_type; + + sstack() = delete; + + sstack ( + T* data_, + size_t s + ) : data(data_), mysize(s) {} + + const T& top() const + { + DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); + return *data; + } + T& top() + { + DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); + return *data; + } + + size_t size() const { return mysize; } + + sstack pop(size_t num=1) + { + DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it."); + return sstack(data+num, mysize-num); + } + + private: + + T* data; + size_t mysize; + }; + + template + sstack 
make_sstack(std::vector& item) + { + return sstack(item.data(), item.size()); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace dimpl + { + template + class subnet_wrapper + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool that makes an add_layer or add_loss_layer object + expose only the part of its interface defined by the SUBNET + type in layers_abstract.h. This way, when we pass subnetwork + objects to the layer callbacks those callbacks won't be able to + interact with the subnetworks in a way other than specified + by the SUBNET interface spec. + + We also allow the top layer of a subnet_wrapper stack to call the + private_get_output() and private_get_gradient_input() functions. This + way, layers that have had their output/gradient overwritten by in-place + layers can only be accessed from the in-place layers that sit directly + on top of them since those in-place layers are the only layers that + know how to interact with them properly. + !*/ + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + subnet_wrapper(T& l_, unsigned int sef) : l(l_),_sample_expansion_factor(sef) {} + // Not much here because in this case T is one of the input layer types + // that doesn't have anything in it. 
+ typedef T layer_details_type; + typedef T input_layer_type; + const layer_details_type& layer_details() const { return l; } + const input_layer_type& input_layer() const { return l; } + input_layer_type& input_layer() { return l; } + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + private: + T& l; + unsigned int _sample_expansion_factor; + }; + + template + class subnet_wrapper::value>::type> + { + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + typedef T wrapped_type; + const static size_t num_computational_layers = T::num_computational_layers; + const static size_t num_layers = T::num_layers; + typedef typename T::layer_details_type layer_details_type; + typedef typename T::input_layer_type input_layer_type; + + subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} + + const tensor& get_output() const { return l.private_get_output(); } + tensor& get_gradient_input() { return l.private_get_gradient_input(); } + + const layer_details_type& layer_details() const { return l.layer_details(); } + + const subnet_wrapper& subnet() const { return subnetwork; } + subnet_wrapper& subnet() { return subnetwork; } + unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } + + const input_layer_type& input_layer() const { return l.input_layer(); } + input_layer_type& input_layer() { return l.input_layer(); } + + private: + T& l; + subnet_wrapper subnetwork; + }; + + template + class subnet_wrapper::value>::type> + { + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + typedef T wrapped_type; + const static size_t num_computational_layers = T::num_computational_layers; + const static size_t num_layers = T::num_layers; + typedef typename T::layer_details_type layer_details_type; + typedef typename T::input_layer_type 
input_layer_type; + + subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} + + const tensor& get_output() const { return l.get_output(); } + tensor& get_gradient_input() { return l.get_gradient_input(); } + + const layer_details_type& layer_details() const { return l.layer_details(); } + + const subnet_wrapper& subnet() const { return subnetwork; } + subnet_wrapper& subnet() { return subnetwork; } + unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } + + const input_layer_type& input_layer() const { return l.input_layer(); } + input_layer_type& input_layer() { return l.input_layer(); } + + private: + T& l; + subnet_wrapper subnetwork; + }; + } + +// ---------------------------------------------------------------------------------------- + + enum class zero_gradients : uint8_t + { + no = 0, + yes = 1 + }; + +// ---------------------------------------------------------------------------------------- + + template + class add_layer; + + template + void serialize(const add_layer& item, std::ostream& out); + template + void deserialize(add_layer& item, std::istream& in); + + template + struct is_nonloss_layer_type> : std::true_type {}; + + template + class add_layer::value>::type> + { + public: + typedef LAYER_DETAILS layer_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_layer_type input_layer_type; + typedef typename subnet_type::input_type input_type; + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t num_computational_layers = subnet_type::num_computational_layers + 1; + + add_layer( + ): + subnetwork(new subnet_type()), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + add_layer(const add_layer& item) + { + details = item.details; + subnetwork.reset(new 
subnet_type(*item.subnetwork)); + this_layer_setup_called = item.this_layer_setup_called; + gradient_input_is_stale = item.gradient_input_is_stale; + get_output_and_gradient_input_disabled = item.get_output_and_gradient_input_disabled; + x_grad = item.x_grad; + cached_output = item.cached_output; + params_grad = item.params_grad; + temp_tensor = item.temp_tensor; + } + add_layer& operator=(const add_layer& item) { add_layer(item).swap(*this); return *this;} + add_layer(add_layer&& item) : add_layer() { swap(item); } + add_layer& operator=(add_layer&& item) { swap(item); return *this; } + + template + friend class add_layer; + template + friend class dimpl::subnet_wrapper; + template + friend class add_tag_layer; + template class T, typename U> + friend class add_skip_layer; + template class L, typename S> + friend class repeat; + + // Allow copying networks from one to another as long as their corresponding + // layers can be constructed from each other. + template + add_layer( + const add_layer& item + ) : + details(item.layer_details()), + subnetwork(new subnet_type(item.subnet())), + this_layer_setup_called(item.this_layer_setup_called), + gradient_input_is_stale(item.gradient_input_is_stale), + get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled), + x_grad(item.x_grad), + cached_output(item.cached_output) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template + add_layer( + const LAYER_DETAILS& layer_det, + T&& ...args + ) : + details(layer_det), + subnetwork(new subnet_type(std::forward(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template + struct disable_forwarding_constr + { + const static bool value = std::is_constructible::value; + }; + template + struct disable_forwarding_constr,U...> + { 
+ const static bool value = disable_forwarding_constr::type...>::value; + }; + template + struct disable_forwarding_constr,U...> + { + const static bool value = disable_forwarding_constr::type>::value; + }; + template + struct disable_forwarding_constr,U...> + { + const static bool value = true; + }; + template + struct disable_forwarding_constr> + { + const static bool value = true; + }; + + template < + typename ...T, + typename = typename std::enable_if::type...>::value>::type + > + add_layer( + T&& ...args + ) : + subnetwork(new subnet_type(std::forward(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template + add_layer( + LAYER_DETAILS&& layer_det, + T&& ...args + ) : + details(std::move(layer_det)), + subnetwork(new subnet_type(std::forward(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template + add_layer( + const std::tuple& layer_det, + T&& ...args + ) : + details(tuple_head(layer_det)), + subnetwork(new subnet_type(tuple_tail(layer_det),std::forward(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template + add_layer( + std::tuple<>, + const std::tuple& layer_det, + T&& ...args + ) : add_layer(layer_det,args...) { } + + add_layer ( + std::tuple<> + ) : add_layer() {} + + template + add_layer( + std::tuple<>, + LAYER_DETAILS&& layer_det, + T&& ...args + ) : add_layer(layer_det, args...) 
{ } + + template + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork->to_tensor(ibegin,iend,data); + } + + template + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + subnetwork->forward(x); + const dimpl::subnet_wrapper wsub(*subnetwork); + if (!this_layer_setup_called) + { + details.setup(wsub); + this_layer_setup_called = true; + } + if (this_layer_operates_inplace()) + impl::call_layer_forward(details, wsub, private_get_output()); + else + impl::call_layer_forward(details, wsub, cached_output); + + gradient_input_is_stale = true; + return private_get_output(); + } + + private: + tensor& private_get_output() const + { + if (const_cast(*this).this_layer_operates_inplace()) + return subnetwork->private_get_output(); + else + return const_cast(cached_output); + } + tensor& private_get_gradient_input() + { + if (this_layer_operates_inplace()) + { + return subnetwork->private_get_gradient_input(); + } + else + { + if (gradient_input_is_stale) + { + gradient_input_is_stale = false; + x_grad.copy_size(private_get_output()); + x_grad = 0; + } + return x_grad; + } + } + void disable_output_and_gradient_getters ( + ) { get_output_and_gradient_input_disabled = true; } + public: + const tensor& get_output() const + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); + return private_get_output(); + } + tensor& get_gradient_input() + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); + return 
private_get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return subnetwork->get_final_data_gradient(); } + + void back_propagate_error( + const tensor& x, + zero_gradients zero_grads = zero_gradients::yes + ) + { + back_propagate_error(x, private_get_gradient_input(), zero_grads); + } + void back_propagate_error( + const tensor& x, + const tensor& gradient_input, + zero_gradients zero_grads = zero_gradients::yes + ) + { + dimpl::subnet_wrapper wsub(*subnetwork); + params_grad.copy_size(details.get_layer_params()); + impl::call_layer_backward(details, private_get_output(), + gradient_input, wsub, static_cast(params_grad)); + + subnetwork->back_propagate_error(x, zero_grads); + + // zero out get_gradient_input() + gradient_input_is_stale = zero_grads == zero_gradients::yes; + } + + template + void update_parameters(sstack solvers, double learning_rate) + { + DLIB_CASSERT(solvers.size()>=num_computational_layers); + // Don't try to adjust the parameters if this layer doesn't have any or the + // learning rate is disabled for this layer. 
+ if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) + { + const tensor& step = solvers.top()(learning_rate, details, static_cast(params_grad)); + tt::add(details.get_layer_params(), details.get_layer_params(), step); + } + subnetwork->update_parameters(solvers.pop(), learning_rate); + } + + template + void update_parameters(std::vector& solvers, double learning_rate) + { + update_parameters(make_sstack(solvers), learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return *subnetwork; } + subnet_type& subnet() { return *subnetwork; } + + const input_layer_type& input_layer() const { return subnet().input_layer(); } + input_layer_type& input_layer() { return subnet().input_layer(); } + + const layer_details_type& layer_details() const { return details; } + layer_details_type& layer_details() { return details; } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void set_gradient_inputs_to_zero() + { + gradient_input_is_stale = true; + subnetwork->set_gradient_inputs_to_zero(); + } + + void clean() + { + x_grad.clear(); + cached_output.clear(); + params_grad.clear(); + temp_tensor.clear(); + gradient_input_is_stale = true; + subnetwork->clean(); + call_clean_method_if_exists(details); + } + + friend void serialize(const add_layer& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(*item.subnetwork, out); + serialize(item.details, out); + serialize(item.this_layer_setup_called, out); + serialize(item.gradient_input_is_stale, out); + serialize(item.get_output_and_gradient_input_disabled, out); + serialize(item.x_grad, out); + serialize(item.cached_output, out); + serialize(item.params_grad, out); + } + + friend void deserialize(add_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if 
(!(1 <= version && version <= 2)) + throw serialization_error("Unexpected version found while deserializing dlib::add_layer."); + deserialize(*item.subnetwork, in); + deserialize(item.details, in); + deserialize(item.this_layer_setup_called, in); + deserialize(item.gradient_input_is_stale, in); + deserialize(item.get_output_and_gradient_input_disabled, in); + deserialize(item.x_grad, in); + deserialize(item.cached_output, in); + if (version == 2) + deserialize(item.params_grad, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + bool this_layer_operates_inplace( + ) + { + // This layer can run in-place if it's an in-place capable layer and also if + // the layer it's on top of doesn't need its own output tensor (since in-place + // layers overwrite that tensor) + return impl::is_inplace_layer(details, *subnetwork) && !subnetwork->this_layer_requires_forward_output(); + } + bool this_layer_requires_forward_output( + ) + { + return impl::backward_requires_forward_output(details, *subnetwork); + } + + void swap(add_layer& item) + { + std::swap(subnetwork,item.subnetwork); + std::swap(details, item.details); + std::swap(this_layer_setup_called, item.this_layer_setup_called); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); + std::swap(x_grad, item.x_grad); + std::swap(cached_output, item.cached_output); + std::swap(params_grad, item.params_grad); + } + + + LAYER_DETAILS details; + std::unique_ptr subnetwork; + bool this_layer_setup_called; + bool gradient_input_is_stale; + bool 
get_output_and_gradient_input_disabled; + // Note that if this_layer_operates_inplace()==true then x_grad and cached_output + // are not used at all. Instead, this layer uses these variables from the lower + // layer. + resizable_tensor x_grad; + resizable_tensor cached_output; + + resizable_tensor params_grad; + + // temp_tensor doesn't logically contribute to the state of this object. + // It is here only to prevent it from being reallocated over and over. + resizable_tensor temp_tensor; + + }; + + template + struct is_add_layer> : std::true_type {}; + template + struct is_add_layer> : std::true_type {}; + template + struct is_add_layer&> : std::true_type {}; + template + struct is_add_layer&> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- + +// This version of add_layer handles the special case where the subnetwork being given is +// just an input layer object. + template + class add_layer + { + public: + typedef LAYER_DETAILS layer_details_type; + typedef INPUT_LAYER subnet_type; + typedef INPUT_LAYER input_layer_type; + typedef typename INPUT_LAYER::input_type input_type; + const static size_t num_layers = 2; + const static size_t num_computational_layers = 1; + + add_layer( + ): + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer(const add_layer&) = default; + add_layer(add_layer&& item) : add_layer() { swap(item); } + add_layer& operator=(const add_layer&) = default; + add_layer& operator=(add_layer&& item) { swap(item); return *this; } + + template + friend class add_layer; + template + friend class dimpl::subnet_wrapper; + template + friend class add_tag_layer; + template class T, typename U> + friend class add_skip_layer; + template class L, typename S> + friend class repeat; + + // Allow copying networks from one to another as long as their corresponding + // layers can be 
constructed from each other. + template + add_layer( + const add_layer& item + ): + input_layer_(item.subnet()), + details(item.layer_details()), + this_layer_setup_called(item.this_layer_setup_called), + gradient_input_is_stale(item.gradient_input_is_stale), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(item._sample_expansion_factor), + x_grad(item.x_grad), + cached_output(item.cached_output), + grad_final(item.grad_final) + { + } + + add_layer( + const LAYER_DETAILS& layer_det + ) : + details(layer_det), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + const INPUT_LAYER& il + ) : + input_layer_(il), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + LAYER_DETAILS&& layer_det + ) : + details(std::move(layer_det)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + LAYER_DETAILS layer_det, + INPUT_LAYER il + ) : + details(std::move(layer_det)), + input_layer_(std::move(il)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + std::tuple<>, + const LAYER_DETAILS& layer_det + ) : add_layer(layer_det) {} + + add_layer( + std::tuple<>, + LAYER_DETAILS&& layer_det + ) : add_layer(layer_det) {} + + add_layer( + std::tuple<>, + LAYER_DETAILS layer_det, + INPUT_LAYER il + ) : add_layer(layer_det,il) {} + + add_layer( + const std::tuple& layer_det + ) : add_layer(tuple_head(layer_det)) {} + + add_layer( + const std::tuple& layer_det, + INPUT_LAYER il + ) : add_layer(tuple_head(layer_det),il) {} + + template + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + 
resizable_tensor& data + ) const + { + input_layer_.to_tensor(ibegin, iend, data); + // make sure the input layer's to_tensor() function is implemented properly. + DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), + "The input layer can't produce fewer output tensors than there are inputs."); + DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, + "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); + + _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); + data.async_copy_to_device(); + } + + + template + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward (const tensor& x) + { + DLIB_CASSERT(sample_expansion_factor() != 0, "You must call to_tensor() before this function can be used."); + DLIB_CASSERT(x.num_samples()%sample_expansion_factor() == 0); + subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); + if (!this_layer_setup_called) + { + details.setup(wsub); + this_layer_setup_called = true; + } + impl::call_layer_forward(details, wsub, cached_output); + gradient_input_is_stale = true; + return private_get_output(); + } + + private: + tensor& private_get_output() const { return const_cast(cached_output); } + tensor& private_get_gradient_input() + { + if (gradient_input_is_stale) + { + gradient_input_is_stale = false; + x_grad.copy_size(private_get_output()); + x_grad = 0; + } + return x_grad; + } + void disable_output_and_gradient_getters ( + ) { get_output_and_gradient_input_disabled = true; } + public: + const tensor& get_output() const + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); + return 
private_get_output(); + } + tensor& get_gradient_input() + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); + return private_get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return grad_final; } + + void back_propagate_error( + const tensor& x, + zero_gradients zero_grads = zero_gradients::yes + ) + { + back_propagate_error(x, private_get_gradient_input(), zero_grads); + } + void back_propagate_error( + const tensor& x, + const tensor& gradient_input, + zero_gradients zero_grads = zero_gradients::yes + ) + { + // make sure grad_final is initialized to 0 + if (!have_same_dimensions(x, grad_final)) + grad_final.copy_size(x); + grad_final = 0; + + subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); + params_grad.copy_size(details.get_layer_params()); + impl::call_layer_backward(details, private_get_output(), + gradient_input, wsub, static_cast(params_grad)); + + // zero out get_gradient_input() + gradient_input_is_stale = zero_grads == zero_gradients::yes; + } + + template + void update_parameters(sstack solvers, double learning_rate) + { + DLIB_CASSERT(solvers.size()>=num_computational_layers); + // Don't try to adjust the parameters if this layer doesn't have any or the + // learning rate is disabled for this layer. 
+ if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) + { + const tensor& step = solvers.top()(learning_rate, details, static_cast(params_grad)); + tt::add(details.get_layer_params(), details.get_layer_params(), step); + } + } + + template + void update_parameters(std::vector& solvers, double learning_rate) + { + update_parameters(make_sstack(solvers), learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return input_layer_; } + subnet_type& subnet() { return input_layer_; } + + const subnet_type& input_layer() const { return input_layer_; } + subnet_type& input_layer() { return input_layer_; } + + const layer_details_type& layer_details() const { return details; } + layer_details_type& layer_details() { return details; } + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + + void set_gradient_inputs_to_zero() + { + gradient_input_is_stale = true; + } + + void clean() + { + x_grad.clear(); + grad_final.clear(); + cached_output.clear(); + params_grad.clear(); + temp_tensor.clear(); + gradient_input_is_stale = true; + call_clean_method_if_exists(details); + } + + friend void serialize(const add_layer& item, std::ostream& out) + { + int version = 3; + serialize(version, out); + serialize(item.input_layer_, out); + serialize(item.details, out); + serialize(item.this_layer_setup_called, out); + serialize(item.gradient_input_is_stale, out); + serialize(item.get_output_and_gradient_input_disabled, out); + serialize(item.x_grad, out); + serialize(item.cached_output, out); + serialize(item.grad_final, out); + serialize(item._sample_expansion_factor, out); + } + + friend void deserialize(add_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (!(2 <= version && version <= 3)) + throw serialization_error("Unexpected version found while 
deserializing dlib::add_layer."); + deserialize(item.input_layer_, in); + deserialize(item.details, in); + deserialize(item.this_layer_setup_called, in); + deserialize(item.gradient_input_is_stale, in); + deserialize(item.get_output_and_gradient_input_disabled, in); + deserialize(item.x_grad, in); + deserialize(item.cached_output, in); + deserialize(item.grad_final, in); + if (version >= 3) + deserialize(item._sample_expansion_factor, in); + else + item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. + } + + friend std::ostream& operator<< (std::ostream& out, const add_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; + + // Don't print the repeat_input_layer since it doesn't exist from the user's + // point of view. It's just an artifact of how repeat<> works. 
+ if (!std::is_same::value) + out << "layer<" << idx+1 << ">\t" << subnet() << "\n"; + } + + private: + + bool this_layer_requires_forward_output( + ) + { + subnet_wrapper wsub(grad_final, grad_final, _sample_expansion_factor); + return impl::backward_requires_forward_output(details, wsub); + } + + class subnet_wrapper + { + public: + subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_, unsigned int sef) : + x(x_), grad_final(grad_final_), _sample_expansion_factor(sef) {} + + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor;} + const tensor& get_output() const { return x; } + tensor& get_gradient_input() + { + if (!have_same_dimensions(x, grad_final)) + { + grad_final.copy_size(x); + grad_final = 0; + } + return grad_final; + } + + private: + const tensor& x; + resizable_tensor& grad_final; + unsigned int _sample_expansion_factor; + }; + + void swap(add_layer& item) + { + std::swap(input_layer_, item.input_layer_); + std::swap(details, item.details); + std::swap(this_layer_setup_called, item.this_layer_setup_called); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); + std::swap(x_grad, item.x_grad); + std::swap(cached_output, item.cached_output); + std::swap(grad_final, item.grad_final); + std::swap(_sample_expansion_factor, item._sample_expansion_factor); + } + + subnet_type input_layer_; + LAYER_DETAILS details; + bool this_layer_setup_called; + bool gradient_input_is_stale; + bool get_output_and_gradient_input_disabled; + mutable unsigned int _sample_expansion_factor; + resizable_tensor x_grad; + resizable_tensor cached_output; + resizable_tensor grad_final; + + // The following 2 objects don't logically contribute to the state of this class. 
+ // They are only here to prevent them from being reallocated over and over in + // member functions. + resizable_tensor params_grad; + resizable_tensor temp_tensor; + }; + +// ---------------------------------------------------------------------------------------- + + template + class add_tag_layer; + + template class tag> + struct tag_id + { + const static unsigned long id = tag::id; + }; + + template + class add_tag_layer::value>::type> + { + public: + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + typedef typename subnet_type::input_layer_type input_layer_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static unsigned long id = ID; + + add_tag_layer() {}; + add_tag_layer(const add_tag_layer&) = default; + add_tag_layer(add_tag_layer&&) = default; + add_tag_layer& operator=(add_tag_layer&&) = default; + add_tag_layer& operator=(const add_tag_layer&) = default; + + template + add_tag_layer( + const add_tag_layer& item + ) : subnetwork(item.subnet()) + {} + + template + add_tag_layer( + T ...args + ) : + subnetwork(std::move(args)...) 
+ { + } + + template + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + template + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + return subnetwork(ibegin,iend); + } + + const tensor& operator() (const input_type& x) + { + return subnetwork(x); + } + + const tensor& forward(const tensor& x) + { + return subnetwork.forward(x); + } + + const tensor& get_output() const { return subnetwork.get_output(); } + + tensor& get_gradient_input() + { + return subnetwork.get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return subnetwork.get_final_data_gradient(); } + + void back_propagate_error( + const tensor& x, + zero_gradients zero_grads = zero_gradients::yes + ) + { + subnetwork.back_propagate_error(x, zero_grads); + } + void back_propagate_error( + const tensor& x, + const tensor& gradient_input, + zero_gradients zero_grads = zero_gradients::yes + ) + { + subnetwork.back_propagate_error(x,gradient_input, zero_grads); + } + + template + void update_parameters(sstack solvers, double learning_rate) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + template + void update_parameters(std::vector& solvers, double learning_rate) + { + update_parameters(make_sstack(solvers), learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + + const input_layer_type& input_layer() const { return subnet().input_layer(); } + input_layer_type& input_layer() { return subnet().input_layer(); } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void set_gradient_inputs_to_zero() + { + subnetwork.set_gradient_inputs_to_zero(); + } + + void 
clean() + { + subnetwork.clean(); + } + + friend void serialize(const add_tag_layer& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(add_tag_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << "tag" << ID << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + template + friend class add_layer; + template + friend class dimpl::subnet_wrapper; + template + friend class add_tag_layer; + template class T, typename U> + friend class add_skip_layer; + template class L, typename S> + friend class repeat; + + // You wouldn't put a tag on a layer if you didn't want to access its forward + // outputs. So this is always true. + bool this_layer_requires_forward_output( + ) { return true; } + + void disable_output_and_gradient_getters ( + ) + { + // This should never happen because only inplace layers call + // disable_output_and_gradient_getters(), however, putting a tag layer right + // before an inplace layer basically means you don't want the following layer + // to operate in place. So the inplace layer should turn itself into an + // out-of-place layer and not call disable_output_and_gradient_getters(). 
+ DLIB_CASSERT(false,"This should never happen"); + } + + tensor& private_get_output() const + { return subnetwork.private_get_output(); } + tensor& private_get_gradient_input() + { return subnetwork.private_get_gradient_input(); } + + subnet_type subnetwork; + + // This member doesn't logically contribute to the state of the object since it is + // always empty. It's just here so we can have the get_parameter_gradient() methods + // which have to return something. So they return this empty tensor. + resizable_tensor params_grad; + }; + +// ---------------------------------------------------------------------------------------- + + template + struct decorator_repeat_group + { + decorator_repeat_group( + T&& ...args + ) : data(std::forward(args)...) {} + + std::tuple data; + }; + template + decorator_repeat_group repeat_group ( + T&& ...args + ) + { + return decorator_repeat_group(std::forward(args)...); + } + + template < + size_t num, + template class REPEATED_LAYER, + typename SUBNET + > + class repeat + { + static_assert(num > 0, "You can't have a layer repeated 0 times."); + public: + typedef SUBNET subnet_type; + typedef typename SUBNET::input_type input_type; + typedef typename subnet_type::input_layer_type input_layer_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. 
+ const static size_t comp_layers_in_each_group = (REPEATED_LAYER::num_computational_layers-SUBNET::num_computational_layers); + const static size_t comp_layers_in_repeated_group = comp_layers_in_each_group*num; + const static size_t num_computational_layers = comp_layers_in_repeated_group + SUBNET::num_computational_layers; + + const static size_t layers_in_each_group = (REPEATED_LAYER::num_layers-SUBNET::num_layers); + const static size_t layers_in_repeated_group = layers_in_each_group*num; + const static size_t num_layers = subnet_type::num_layers + layers_in_repeated_group; + + + typedef REPEATED_LAYER repeated_layer_type; + + repeat( + ) : + details(num) + { + } + + size_t num_repetitions ( + ) const { return num; } + + const repeated_layer_type& get_repeated_layer ( + size_t i + ) const + { + DLIB_CASSERT(i < num_repetitions()); + return details[i]; + } + + repeated_layer_type& get_repeated_layer ( + size_t i + ) + { + DLIB_CASSERT(i < num_repetitions()); + return details[i]; + } + + repeat(const repeat&) = default; + repeat(repeat&&) = default; + repeat& operator=(repeat&&) = default; + repeat& operator=(const repeat&) = default; + + template class T, typename U> + repeat( + const repeat& item + ) : + subnetwork(item.subnetwork) + { + for (auto&& d : item.details) + details.emplace_back(d); + } + + template + repeat( + T arg1, + U ...args2 + ): + details(num, std::move(arg1)), + subnetwork(std::move(args2)...) + { + } + + template + repeat( + decorator_repeat_group&& arg1, + U ...args2 + ): + details(num, arg1.data), + subnetwork(std::move(args2)...) + { + } + + template + repeat( + std::tuple<>, + T arg1, + U ...args2 + ): + details(num, std::move(arg1)), + subnetwork(std::move(args2)...) 
+ { + } + + template + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + // call to_tensor on the networks in details just to populate the + // _sample_expansion_factor values in those networks. Other than that this + // call is a noop. + for (auto& d : details) + d.to_tensor(ibegin, iend, data); + } + + template + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + subnetwork.forward(x); + details[details.size()-1].forward(subnetwork.get_output()); + for (long i = details.size()-2; i >= 0; --i) + details[i].forward(details[i+1].get_output()); + return private_get_output(); + } + + private: + tensor& private_get_output() const + { + return details[0].private_get_output(); + } + tensor& private_get_gradient_input() + { + return details[0].private_get_gradient_input(); + } + public: + const tensor& get_output() const + { + return details[0].get_output(); + } + tensor& get_gradient_input() + { + return details[0].get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return subnetwork.get_final_data_gradient(); } + + const tensor& get_parameter_gradient( + ) const { return details[0].get_parameter_gradient(); } + + tensor& get_parameter_gradient ( + ) { return details[0].get_parameter_gradient(); } + + void back_propagate_error( + const tensor& x, + zero_gradients zero_grads = zero_gradients::yes + ) + { + back_propagate_error(x, private_get_gradient_input(), zero_grads); + } + void back_propagate_error( + const tensor& x, + const tensor& gradient_input, + zero_gradients zero_grads = zero_gradients::yes + ) + { + if (details.size() > 1) + { + details[0].back_propagate_error(details[1].get_output(), 
gradient_input, zero_grads); + for (size_t i = 1; i < details.size(); ++i) + { + if (i+1 < details.size()) + details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient(), zero_grads); + else + details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient(), zero_grads); + } + } + else + { + details[0].back_propagate_error(subnetwork.get_output(), gradient_input, zero_grads); + } + subnetwork.back_propagate_error(x, details.back().get_final_data_gradient(), zero_grads); + } + + template + void update_parameters(sstack solvers, double learning_rate) + { + for (size_t i = 0; i < details.size(); ++i) + details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),learning_rate); + subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),learning_rate); + } + + template + void update_parameters(std::vector& solvers, double learning_rate) + { + update_parameters(make_sstack(solvers), learning_rate); + } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + + const input_layer_type& input_layer() const { return subnet().input_layer(); } + input_layer_type& input_layer() { return subnet().input_layer(); } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void set_gradient_inputs_to_zero() + { + subnetwork.set_gradient_inputs_to_zero(); + } + + void clean() + { + temp_tensor.clear(); + subnetwork.clean(); + for (auto&& d : details) + d.clean(); + } + + friend void serialize(const repeat& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.details, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(repeat& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::repeat."); + 
deserialize(item.details, in); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const repeat& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + for (size_t i = 0; i < num_repetitions(); ++i) + { + get_repeated_layer(i).print(out, idx, min_length); + idx += layers_in_each_group; + } + subnet().print(out, idx, min_length); + } + private: + + + template + friend class add_layer; + template + friend class dimpl::subnet_wrapper; + template + friend class add_tag_layer; + template class T, typename U> + friend class add_skip_layer; + template class L, typename S> + friend class repeat; + + bool this_layer_requires_forward_output( + ) + { + return details[0].this_layer_requires_forward_output(); + } + + void disable_output_and_gradient_getters ( + ) + { + details[0].disable_output_and_gradient_getters(); + } + + + std::vector details; + subnet_type subnetwork; + + // temp_tensor doesn't logically contribute to the state of this class. + // It is here only to void needing to reallocate it over and over. + resizable_tensor temp_tensor; + }; + + template < + size_t num, + template class REPEATED_LAYER, + typename SUBNET + > + struct is_nonloss_layer_type> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- + +// This version of add_tag_layer handles the special case where the subnetwork being given +// is just an input layer object. + template + class add_tag_layer + { + public: + typedef INPUT_LAYER subnet_type; + typedef typename subnet_type::input_type input_type; + typedef INPUT_LAYER input_layer_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. 
+ const static size_t num_computational_layers = 0; + const static size_t num_layers = 2; + const static unsigned long id = ID; + + add_tag_layer():cached_output_ptr(nullptr),gradient_input_is_stale(true),_sample_expansion_factor(0) {} + + add_tag_layer(const add_tag_layer&) = default; + add_tag_layer& operator=(const add_tag_layer&) = default; + add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); } + add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; } + + template + add_tag_layer( + const add_tag_layer& item + ) : input_layer_(item.subnet()), + cached_output(item.cached_output), + cached_output_ptr(nullptr), + grad_final(item.grad_final), + gradient_input_is_stale(item.gradient_input_is_stale), + _sample_expansion_factor(0) + {} + + template + add_tag_layer( + T ...args + ) : + input_layer_(std::move(args)...), + cached_output_ptr(nullptr), + gradient_input_is_stale(true), + _sample_expansion_factor(0) + { + } + + add_tag_layer ( + std::tuple<> + ) : + cached_output_ptr(nullptr), + gradient_input_is_stale(true), + _sample_expansion_factor(0) + {} + + template + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + input_layer_.to_tensor(ibegin,iend,data); + + // make sure the input layer's to_tensor() function is implemented properly. 
+ DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), + "The input layer can't produce fewer output tensors than there are inputs."); + DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, + "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); + + _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); + data.async_copy_to_device(); + } + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + + template + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + input_layer_.to_tensor(ibegin,iend,cached_output); + cached_output_ptr = nullptr; + return get_output(); + } + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + // If this tag is the first layer in one of the sub networks inside a repeat + // layer then we don't want it to be creating copies of x. This is because, we + // can just hold a pointer to x since the way repeat is constructed guarantees + // that x will have a lifetime larger than this pointer. 
+ if (is_same_type::value) + cached_output_ptr = const_cast(&x); + else + cached_output = x; + gradient_input_is_stale = true; + return get_output(); + } + + const tensor& get_output() const + { + if (cached_output_ptr) + return *cached_output_ptr; + else + return cached_output; + } + + const tensor& get_final_data_gradient( + ) const { return grad_final; } + + tensor& get_gradient_input() + { + if (!have_same_dimensions(get_output(), grad_final) || + gradient_input_is_stale) + { + grad_final.copy_size(get_output()); + grad_final = 0; + gradient_input_is_stale = false; + } + return grad_final; + } + + + void back_propagate_error( + const tensor& /*x*/, + zero_gradients /*zero_grads*/ = zero_gradients::yes + ) + { + // nothing to do + } + void back_propagate_error( + const tensor& /*x*/, + const tensor& /*gradient_input*/, + zero_gradients /*zero_grads*/ = zero_gradients::yes + ) + { + // nothing to do + } + + template + void update_parameters(sstack /*solvers*/, double /*learning_rate*/) + { + // nothing to do + } + + template + void update_parameters(std::vector& solvers, double learning_rate) + { + update_parameters(make_sstack(solvers), learning_rate); + } + + const subnet_type& subnet() const { return input_layer_; } + subnet_type& subnet() { return input_layer_; } + + const input_layer_type& input_layer() const { return input_layer_; } + input_layer_type& input_layer() { return input_layer_; } + + void set_gradient_inputs_to_zero() + { + // nothing to do + } + + void clean() + { + grad_final.clear(); + cached_output.clear(); + cached_output_ptr = 0; + } + + friend void serialize(const add_tag_layer& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(item.input_layer_, out); + serialize(item.cached_output, out); + serialize(item.grad_final, out); + serialize(item.gradient_input_is_stale, out); + serialize(item._sample_expansion_factor, out); + } + + friend void deserialize(add_tag_layer& item, std::istream& in) + { + int 
version = 0; + deserialize(version, in); + if (!(1 <= version && version <= 2)) + throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); + deserialize(item.input_layer_, in); + deserialize(item.cached_output, in); + deserialize(item.grad_final, in); + deserialize(item.gradient_input_is_stale, in); + item.cached_output_ptr = nullptr; + if (version >= 2) + deserialize(item._sample_expansion_factor, in); + else + item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. + + } + + friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<"<\t"< works. + if (!std::is_same::value) + out << "layer<"<< idx+1 << ">\t" << subnet() << "\n"; + } + + private: + + template + friend class add_layer; + template + friend class dimpl::subnet_wrapper; + template + friend class add_tag_layer; + template class T, typename U> + friend class add_skip_layer; + template class L, typename S> + friend class repeat; + + // You woudln't put a tag on a layer if you didn't want to access its forward + // outputs. So this is always true. + bool this_layer_requires_forward_output( + ) { return true; } + + void disable_output_and_gradient_getters ( + ) + { + // This should never happen because only inplace layers call + // disable_output_and_gradient_getters(), however, putting a tag layer right + // before an inplace layer basically means you don't want the following layer + // to operate in place. So the inplace layer should turn itself into an + // out-of-place layer and not call disable_output_and_gradient_getters(). 
+ DLIB_CASSERT(false,"This should never happen"); + } + + tensor& private_get_output() const + { return const_cast(get_output()); } + tensor& private_get_gradient_input() + { return get_gradient_input(); } + + void swap(add_tag_layer& item) + { + std::swap(input_layer_, item.input_layer_); + std::swap(cached_output, item.cached_output); + std::swap(cached_output_ptr, item.cached_output_ptr); + std::swap(grad_final, item.grad_final); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(_sample_expansion_factor, item._sample_expansion_factor); + } + + subnet_type input_layer_; + resizable_tensor cached_output; + tensor* cached_output_ptr; + resizable_tensor grad_final; + bool gradient_input_is_stale; + mutable unsigned int _sample_expansion_factor; + }; + + template + struct is_nonloss_layer_type> : std::true_type {}; + + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template + class add_loss_layer; + + class no_label_type + { + private: + // We don't want anyone making these no_label_type objects. They are here only to + // allow add_loss_layer::training_label_type and dnn_trainer::training_label_type + // to exist which avoids needing to overload add_loss_layer and dnn_trainer for + // supervised an unsupervised losses. It also can be a type to use in template + // metaprogramming to indicate "no label". So here we make the constructor private + // with the exception that add_loss_layer objects can make it (again, just to + // simplify add_loss_layer's implementation). 
+ no_label_type(){}; + template friend class add_loss_layer; + template < typename net_type, typename solver_type > friend class dnn_trainer; + }; + +// ---------------------------------------------------------------------------------------- + + template + class add_loss_layer + { + template + struct get_loss_layer_training_label_type + { + typedef no_label_type type; + }; + template + struct get_loss_layer_training_label_type::type> + { + typedef typename T::training_label_type type; + }; + + template + struct get_loss_layer_output_label_type + { + typedef no_label_type type; + }; + template + struct get_loss_layer_output_label_type::type> + { + typedef typename T::output_label_type type; + }; + + public: + typedef LOSS_DETAILS loss_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + typedef typename subnet_type::input_layer_type input_layer_type; + const static size_t num_layers = subnet_type::num_layers + 1; + // Note that the loss layer doesn't count as an additional computational layer. + const static size_t num_computational_layers = subnet_type::num_computational_layers; + typedef typename get_loss_layer_training_label_type::type training_label_type; + typedef typename get_loss_layer_output_label_type::type output_label_type; + + static_assert(is_nonloss_layer_type::value, + "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer."); + + + add_loss_layer() {}; + add_loss_layer(const add_loss_layer&) = default; + add_loss_layer& operator=(const add_loss_layer&) = default; + add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); } + add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; } + + template + add_loss_layer( + const add_loss_layer& item + ) : + loss(item.loss_details()), + subnetwork(item.subnet()) + {} + + template + add_loss_layer( + const LOSS_DETAILS& layer_det, + T&& ...args + ) : + loss(layer_det), + subnetwork(std::forward(args)...) 
+ { + } + + template + add_loss_layer( + LOSS_DETAILS&& layer_det, + T&& ...args + ) : + loss(std::move(layer_det)), + subnetwork(std::forward(args)...) + { + } + + template + struct disable_forwarding_constr + { + const static bool value = std::is_constructible::value; + }; + template + struct disable_forwarding_constr> + { + const static bool value = true; + }; + + template < + typename ...T, + typename = typename std::enable_if::type...>::value>::type + > + add_loss_layer( + T&& ...args + ) : + subnetwork(std::forward(args)...) + { + } + + template + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + template + void operator() ( + const tensor& x, + output_iterator obegin + ) + { + subnetwork.forward(x); + const dimpl::subnet_wrapper wsub(subnetwork); + loss.to_label(x, wsub, obegin); + } + + template + void operator() ( + forward_iterator ibegin, + forward_iterator iend, + output_iterator obegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + (*this)(temp_tensor, obegin); + } + + const output_label_type& operator() (const input_type& x) + { + (*this)(&x, &x+1, &temp_label); + return temp_label; + } + + template + const output_label_type& process (const input_type& x, T&& ...args) + { + to_tensor(&x,&x+1,temp_tensor); + subnetwork.forward(temp_tensor); + const dimpl::subnet_wrapper wsub(subnetwork); + loss.to_label(temp_tensor, wsub, &temp_label, std::forward(args)...); + return temp_label; + } + + template + std::vector process_batch (const iterable_type& data, size_t batch_size, T&& ...args) + { + std::vector results(std::distance(data.begin(), data.end())); + auto o = results.begin(); + auto i = data.begin(); + auto num_remaining = results.size(); + while(num_remaining != 0) + { + auto inc = std::min(batch_size, num_remaining); + 
to_tensor(i,i+inc,temp_tensor); + subnetwork.forward(temp_tensor); + const dimpl::subnet_wrapper wsub(subnetwork); + loss.to_label(temp_tensor, wsub, o, std::forward(args)...); + + i += inc; + o += inc; + num_remaining -= inc; + } + return results; + } + + void back_propagate_error( + const tensor& x, + zero_gradients zero_grads = zero_gradients::yes + ) + { + subnet().back_propagate_error(x, zero_grads); + } + + void back_propagate_error( + const tensor& x, + const tensor& gradient_input, + zero_gradients zero_grads = zero_gradients::yes + ) + { + subnet().back_propagate_error(x, gradient_input, zero_grads); + } + + const tensor& get_final_data_gradient( + ) const + { + return subnet().get_final_data_gradient(); + } + + const tensor& forward(const tensor& x) + { + return subnet().forward(x); + } + + template + std::vector operator() ( + const iterable_type& data, + size_t batch_size = 128 + ) + { + std::vector results(std::distance(data.begin(), data.end())); + auto o = results.begin(); + auto i = data.begin(); + auto num_remaining = results.size(); + while(num_remaining != 0) + { + auto inc = std::min(batch_size, num_remaining); + (*this)(i, i+inc, o); + i += inc; + o += inc; + num_remaining -= inc; + } + return results; + } + + template + double compute_loss ( + const tensor& x, + label_iterator lbegin + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper wsub(subnetwork); + return loss.compute_loss_value_and_gradient(x, lbegin, wsub); + } + + template + double compute_loss ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator lbegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_loss(temp_tensor, lbegin); + } + + double compute_loss ( + const tensor& x + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper wsub(subnetwork); + return loss.compute_loss_value_and_gradient(x, wsub); + } + + template + double compute_loss ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return 
compute_loss(temp_tensor); + } + + template + double compute_parameter_gradients ( + const tensor& x, + label_iterator lbegin, + zero_gradients zero_grads = zero_gradients::yes + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper wsub(subnetwork); + double l = loss.compute_loss_value_and_gradient(x, lbegin, wsub); + subnetwork.back_propagate_error(x, zero_grads); + return l; + } + template + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator lbegin, + zero_gradients zero_grads = zero_gradients::yes + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_parameter_gradients(temp_tensor, lbegin, zero_grads); + } + double compute_parameter_gradients ( + const tensor& x, + zero_gradients zero_grads = zero_gradients::yes + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper wsub(subnetwork); + double l = loss.compute_loss_value_and_gradient(x, wsub); + subnetwork.back_propagate_error(x, zero_grads); + return l; + } + template + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend, + zero_gradients zero_grads = zero_gradients::yes + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_parameter_gradients(temp_tensor, zero_grads); + } + + template + void update_parameters ( + sstack solvers, + double learning_rate + ) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + template + void update_parameters(std::vector& solvers, double learning_rate) + { + update_parameters(make_sstack(solvers), learning_rate); + } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + + const input_layer_type& input_layer() const { return subnet().input_layer(); } + input_layer_type& input_layer() { return subnet().input_layer(); } + + const loss_details_type& loss_details() const { return loss; } + loss_details_type& loss_details() { return loss; } + + void set_gradient_inputs_to_zero ( + ) + { + 
subnetwork.set_gradient_inputs_to_zero(); + } + + void clean ( + ) + { + temp_tensor.clear(); + subnetwork.clean(); + } + + template + friend void serialize(const add_loss_layer& item, std::ostream& out); + template + friend void deserialize(add_loss_layer& item, std::istream& in); + + friend std::ostream& operator<< (std::ostream& out, const add_loss_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << loss_details() << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + + void swap(add_loss_layer& item) + { + std::swap(loss, item.loss); + std::swap(subnetwork, item.subnetwork); + } + + loss_details_type loss; + subnet_type subnetwork; + + // These two objects don't logically contribute to the state of this object. They + // are here to prevent them from being reallocated over and over. + output_label_type temp_label; + resizable_tensor temp_tensor; + }; + + template + void serialize(const add_loss_layer& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.loss, out); + serialize(item.subnetwork, out); + } + + template + void deserialize(add_loss_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer."); + deserialize(item.loss, in); + deserialize(item.subnetwork, in); + } + + + template + struct is_loss_layer_type> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template + struct layer_helper + { + static_assert(i < T::num_layers, "Call to 
layer() attempted to access non-existing layer in neural network."); + static T& makeT(); + // If you get error here mentioning lack of member "subnet" in "dlib::input<...>", + // then likely your "dlib::layer<...>" invocation wasn't able to find requested layer. + // This could happen for instance when trying to use skip layer for non-existing tag. + using next_type = typename std::remove_reference::type; + using type = typename layer_helper::type; + static type& layer(T& n) + { + return layer_helper::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template class L, typename S + > + struct layer_helper, typename std::enable_if<(i!=0&&i>=repeat::layers_in_repeated_group)>::type> + { + const static size_t layers_in_repeated_group = repeat::layers_in_repeated_group; + + static repeat& makeT(); + using next_type = typename std::remove_reference::type; + using type = typename layer_helper::type; + static type& layer(repeat& n) + { + return layer_helper::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template class L, typename S + > + struct layer_helper, typename std::enable_if<(i!=0&&i::layers_in_repeated_group)>::type> + { + const static size_t layers_in_each_group = repeat::layers_in_each_group; + typedef typename repeat::repeated_layer_type repeated_layer_type; + using next_type = repeated_layer_type; + using type = typename layer_helper::type; + static type& layer(repeat& n) + { + return layer_helper::layer(n.get_repeated_layer(i/layers_in_each_group)); + } + }; + template < + size_t N, template class L, typename S + > + struct layer_helper<0,repeat, void> + { + typedef typename repeat::repeated_layer_type repeated_layer_type; + using type = repeated_layer_type; + static type& layer(repeat& n) + { + return n.get_repeated_layer(0); + } + }; + + + + template < + unsigned int i, + size_t N, template class L, typename S + > + struct layer_helper, typename std::enable_if<(i!=0&&i>=repeat::layers_in_repeated_group)>::type> 
+ { + const static size_t layers_in_repeated_group = repeat::layers_in_repeated_group; + + static const repeat& makeT(); + using next_type = const typename std::remove_reference::type; + using type = const typename layer_helper::type; + static type& layer(const repeat& n) + { + return layer_helper::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template class L, typename S + > + struct layer_helper, typename std::enable_if<(i!=0&&i::layers_in_repeated_group)>::type> + { + const static size_t layers_in_each_group = repeat::layers_in_each_group; + typedef typename repeat::repeated_layer_type repeated_layer_type; + using next_type = const repeated_layer_type; + using type = const typename layer_helper::type; + static type& layer(const repeat& n) + { + return layer_helper::layer(n.get_repeated_layer(i/layers_in_each_group)); + } + }; + template < + size_t N, template class L, typename S + > + struct layer_helper<0,const repeat, void> + { + typedef typename repeat::repeated_layer_type repeated_layer_type; + using type = const repeated_layer_type; + static type& layer(const repeat& n) + { + return n.get_repeated_layer(0); + } + }; + + + + template + struct layer_helper<0,T,void> + { + using type = T; + static type& layer(T& n) + { + return n; + } + }; + + template class Match, typename T, unsigned int i, typename enabled = void> + struct layer_helper_match + { + static T& makeT(); + using next_type = typename std::remove_reference::type; + using type = typename layer_helper_match::type; + static type& layer(T& n) + { + return layer_helper_match::layer(n.subnet()); + } + }; + // This overload catches add_layer and add_loss_layer templates. + template class Match, typename T, unsigned int i> + struct layer_helper_match>::value>::type> + { + using type = typename layer_helper::type; + static type& layer(T& n) + { + return layer_helper::layer(n); + } + }; + // This overload catches input templates. 
+ template class Match, typename T, unsigned int i> + struct layer_helper_match>::value>::type> + { + using type = typename layer_helper::type; + static type& layer(T& n) + { + return layer_helper::layer(n); + } + }; + // This overload catches subnet_wrapper templates. + template class Match, typename T, unsigned int i> + struct layer_helper_match>::value>::type> + { + using type = typename layer_helper::type; + static type& layer(T& n) + { + return layer_helper::layer(n); + } + }; + } + + template + typename impl::layer_helper::type& layer (T& n) + { + return impl::layer_helper::layer(n); + } + + template class Match, typename T> + typename impl::layer_helper_match::type& layer (T& n) + { + return impl::layer_helper_match::layer(n); + } + + template class Match, unsigned int i, typename T> + typename impl::layer_helper_match::type& layer (T& n) + { + return impl::layer_helper_match::layer(n); + } + +// ---------------------------------------------------------------------------------------- + + template + typename net_type::input_layer_type& input_layer ( + net_type& net + ) + { + return net.input_layer(); + } + + template + const typename net_type::input_layer_type& input_layer ( + const net_type& net + ) + { + return net.input_layer(); + } + +// ---------------------------------------------------------------------------------------- + + template class TAG_TYPE, typename SUBNET> + class add_skip_layer + { + public: + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + typedef typename subnet_type::input_layer_type input_layer_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. 
+ const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static unsigned long id = tag_id::id; + + add_skip_layer() {}; + add_skip_layer(const add_skip_layer&) = default; + add_skip_layer(add_skip_layer&&) = default; + add_skip_layer& operator=(add_skip_layer&&) = default; + add_skip_layer& operator=(const add_skip_layer&) = default; + + template + add_skip_layer( + const add_skip_layer& item + ) : subnetwork(item.subnet()) + {} + + template + add_skip_layer( + T ...args + ) : + subnetwork(std::move(args)...) + { + } + + template + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + template + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + subnetwork(ibegin,iend); + return layer(subnetwork).get_output(); + } + + const tensor& operator() (const input_type& x) + { + subnetwork(x); + return layer(subnetwork).get_output(); + } + + const tensor& forward(const tensor& x) + { + subnetwork.forward(x); + return layer(subnetwork).get_output(); + } + + const tensor& get_output() const + { + return layer(subnetwork).get_output(); + } + + tensor& get_gradient_input() + { + return layer(subnetwork).get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const + { + return subnetwork.get_final_data_gradient(); + } + + void back_propagate_error( + const tensor& x, + zero_gradients zero_grads = zero_gradients::yes + ) + { + subnetwork.back_propagate_error(x, zero_grads); + } + + template + void update_parameters(sstack solvers, double learning_rate) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + template + void update_parameters(std::vector& solvers, double learning_rate) + { + update_parameters(make_sstack(solvers), learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return 
params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + + const subnet_type& subnet() const + { + return subnetwork; + } + + subnet_type& subnet() + { + return subnetwork; + } + + const input_layer_type& input_layer() const { return subnet().input_layer(); } + input_layer_type& input_layer() { return subnet().input_layer(); } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void set_gradient_inputs_to_zero() + { + subnetwork.set_gradient_inputs_to_zero(); + } + + void clean() + { + subnetwork.clean(); + } + + friend void serialize(const add_skip_layer& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(add_skip_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer."); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_skip_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t"< + friend class add_layer; + template + friend class dimpl::subnet_wrapper; + template + friend class add_tag_layer; + template class T, typename U> + friend class add_skip_layer; + template class L, typename S> + friend class repeat; + + bool this_layer_requires_forward_output( + ) { return layer(subnetwork).this_layer_requires_forward_output(); } + + void disable_output_and_gradient_getters ( + ) { layer(subnetwork).disable_output_and_gradient_getters(); } + + tensor& private_get_output() const + { return layer(subnetwork).private_get_output(); } + tensor& private_get_gradient_input() + { return layer(subnetwork).private_get_gradient_input(); } + + subnet_type subnetwork; + + // 
This member doesn't logically contribute to the state of the object since it is + // always empty. It's just here so we can have the get_parameter_gradient() methods + // which have to return something. So they return this empty tensor. + resizable_tensor params_grad; + }; + template class T, typename U> + struct is_nonloss_layer_type> : std::true_type {}; + + template using tag1 = add_tag_layer< 1, SUBNET>; + template using tag2 = add_tag_layer< 2, SUBNET>; + template using tag3 = add_tag_layer< 3, SUBNET>; + template using tag4 = add_tag_layer< 4, SUBNET>; + template using tag5 = add_tag_layer< 5, SUBNET>; + template using tag6 = add_tag_layer< 6, SUBNET>; + template using tag7 = add_tag_layer< 7, SUBNET>; + template using tag8 = add_tag_layer< 8, SUBNET>; + template using tag9 = add_tag_layer< 9, SUBNET>; + template using tag10 = add_tag_layer<10, SUBNET>; + + template using skip1 = add_skip_layer< tag1, SUBNET>; + template using skip2 = add_skip_layer< tag2, SUBNET>; + template using skip3 = add_skip_layer< tag3, SUBNET>; + template using skip4 = add_skip_layer< tag4, SUBNET>; + template using skip5 = add_skip_layer< tag5, SUBNET>; + template using skip6 = add_skip_layer< tag6, SUBNET>; + template using skip7 = add_skip_layer< tag7, SUBNET>; + template using skip8 = add_skip_layer< tag8, SUBNET>; + template using skip9 = add_skip_layer< tag9, SUBNET>; + template using skip10 = add_skip_layer; + +// ---------------------------------------------------------------------------------------- + + namespace timpl + { + inline void fill_with_gassuan_random_numbers ( + tensor& t, + dlib::rand& rnd, + double sigma = 1 + ) + { + float* data = t.host(); + for (size_t i = 0; i < t.size(); ++i) + data[i] = rnd.get_random_gaussian()*sigma; + } + + struct test_layer_params + { + /*! + WHAT THIS OBJECT REPRESENTS + This object allows specifying constraints on tensor dimensions + when testing layers with test_layer(). 
+ + If a member is set to 0 (the default), the dimension is chosen randomly + during testing. If a member is strictly positive, the dimension is fixed + to that value for all iterations. + + This is useful for layers with intrinsic constraints (e.g., k() must be 1 + or nc() must equal a specific d_model). + !*/ + long num_samples = 0; + long k = 0; + long nr = 0; + long nc = 0; + }; + + class test_layer_subnet + { + public: + test_layer_subnet ( + dlib::rand& rnd_ + ) : rnd(rnd_) + { + init(test_layer_params()); + } + + test_layer_subnet( + dlib::rand& rnd_, + const test_layer_params& p + ) : rnd(rnd_) + { + init(p); + } + + + tensor& get_mutable_output() { return output; } + const tensor& get_output() const { return output; } + const tensor& private_get_output() const { return get_output(); } + const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; } + + tensor& get_gradient_input() { return gradient_input; } + tensor& private_get_gradient_input() { return get_gradient_input(); } + test_layer_subnet& subnet() { init_sub(); return *subnetwork; } + + + + unsigned long count_outputs() const + { + if (subnetwork) + return subnetwork->count_outputs() + output.size(); + else + return output.size(); + } + + float& get_output_element(unsigned long i) + { + if (i < output.size()) + return output.host()[i]; + else + return subnet().get_output_element(i-output.size()); + } + + float get_gradient_input_element(unsigned long i) const + { + if (i < gradient_input.size()) + return gradient_input.host()[i]; + else + return subnet().get_gradient_input_element(i-gradient_input.size()); + } + + + private: + void init(const test_layer_params& p) + { + // If a dimension is fixed in p, use it. Otherwise, generate random dimensions. + const long num_samples = p.num_samples != 0 ? p.num_samples : (rnd.get_random_32bit_number() % 4 + 3); + const long k = p.k != 0 ? p.k : (rnd.get_random_32bit_number() % 4 + 2); + const long nr = p.nr != 0 ? 
p.nr : (((rnd.get_random_32bit_number() % 4) / 2) * 2 + 2); + const long nc = p.nc != 0 ? p.nc : (((rnd.get_random_32bit_number() % 4) / 2) * 2 + 2); + + output.set_size(num_samples, k, nr, nc); + gradient_input.set_size(num_samples, k, nr, nc); + + // Use a non-zero initial gradient to make sure the layers add to it + // rather than assign and blow away the initial value. + fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01); + + fill_with_gassuan_random_numbers(output, rnd); + } + + // We lazily initialize sub-layers as needed when someone tries to call + // subnet() + void init_sub() const + { + if (!subnetwork) + subnetwork.reset(new test_layer_subnet(rnd)); + } + + dlib::rand& rnd; + mutable std::unique_ptr subnetwork; + resizable_tensor output; + resizable_tensor gradient_input; + }; + + } + + struct layer_test_results + { + layer_test_results() : was_good(true) {} + explicit layer_test_results(const std::string& l) : log(l),was_good(false) {} + + std::string log; + bool was_good; + + operator bool() const { return was_good; } + }; + + inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item) + { + out << item.log; + return out; + } + + template < + typename layer_details_type + > + layer_test_results impl_test_layer ( + layer_details_type l, + const float base_eps, + const timpl::test_layer_params& p + ) + { + using namespace timpl; + // Do some setup + running_stats rs_data, rs_params; + dlib::rand rnd; + std::ostringstream sout; + for (int iter = 0; iter < 10; ++iter) + { + // Pass the test_layer_params to the subnet constructor + test_layer_subnet subnetwork(rnd, p); + resizable_tensor output, out2, out3; + // Run setup() and forward() as well to make sure any calls to subnet() have + // happened before we start assuming we know how many data elements there are + // (since we do a lazy layer creation thing based on calls to subnet() inside + // test_layer_subnet). 
+ l.setup(subnetwork); + impl::call_layer_forward(l, subnetwork, output); + + resizable_tensor input_grad; + input_grad.copy_size(output); + fill_with_gassuan_random_numbers(input_grad, rnd); + + + // The f() we are computing gradients of is this thing. It's value at the current + // parameter and data values is: + //sout << "f(data,params): " << dot(output, input_grad) << std::endl; + + // We are going to save a copy of the subnetwork.get_gradient_input() data before we do + // backpropagation since the backward() function is supposed to *add* to the + // gradients rather than overwrite them. We will use this saved data to check if + // that is the case. + const unsigned long num_data_inputs = subnetwork.count_outputs(); + std::vector initial_gradient_input(num_data_inputs); + for (unsigned long i = 0; i < num_data_inputs; ++i) + initial_gradient_input[i] = subnetwork.get_gradient_input_element(i); + + + // Now tell the layer to compute all the gradients. In the rest of this function + // we will just be checking that these gradients were computed correctly by + // comparing them to a central differences approximation. + resizable_tensor params_grad; + params_grad.copy_size(l.get_layer_params()); + // But first, set the params grad to something crazy so that it's very obvious if + // it doesn't get fully assigned. + params_grad = std::numeric_limits::infinity(); + impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad); + + static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork), + "Layer not defined correctly. forward and backward methods must either both be in-place or both out-of-place. "); + + // Make sure the outputs of forward() and backward() are the same when they are run + // in in-place mode. 
+ if (impl::is_inplace_layer(l, subnetwork)) + { + test_layer_subnet subnetwork2(rnd, p); + layer_details_type ll(l); + ll.setup(subnetwork2); + resizable_tensor ip_out; + impl::call_layer_forward(ll, subnetwork2, ip_out); + impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output()); + const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output()))); + if (forward_error > 0.00001) + { + sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << std::endl; + return layer_test_results(sout.str()); + } + + resizable_tensor params_grad; + params_grad.copy_size(ll.get_layer_params()); + params_grad = std::numeric_limits::infinity(); + + resizable_tensor input_grad; + input_grad.copy_size(ip_out); + fill_with_gassuan_random_numbers(input_grad, rnd); + resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2; + params_grad1 = params_grad; + params_grad2 = params_grad; + // Now call backward() and make sure it works as well. Recall that when an + // in-place layer works in-place it assigns to it's outputs but when it's + // not running in-place it adds. So we initialize to a non-zero value to + // check that this is the behavior that really executes. 
+ subnetwork2.get_gradient_input() = 9; + impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1); + data_grad1 = subnetwork2.get_gradient_input(); + + subnetwork2.get_gradient_input() = mat(input_grad); + impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2); + data_grad2 = subnetwork2.get_gradient_input(); + if (params_grad.size() != 0) + { + const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2))); + if (backward_param_error > 0.00001) + { + sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << std::endl; + return layer_test_results(sout.str()); + } + } + const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2))); + if (backward_data_error > 0.00001) + { + sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_data_error << std::endl; + return layer_test_results(sout.str()); + } + } + + // ================================================================== + // first validate the way the parameter gradients are computed + for (unsigned long i = 0; i < params_grad.size(); ++i) + { + layer_details_type l1(l); + + float eps = l1.get_layer_params().host()[i]*base_eps; + if (eps == 0) + eps = base_eps; + const float oldval = l1.get_layer_params().host()[i]; + l1.get_layer_params().host()[i] = oldval+eps; + impl::call_layer_forward(l1, subnetwork, out2); + l1.get_layer_params().host()[i] = oldval-eps; + impl::call_layer_forward(l1, subnetwork, out3); + l1.get_layer_params().host()[i] = oldval; + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. 
+ double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); + double output_derivative = params_grad.host()[i]; + double relative_error; + if (reference_derivative*output_derivative != 0) + relative_error = (reference_derivative - output_derivative)/(reference_derivative); + else + relative_error = (reference_derivative - output_derivative); + double absolute_error = (reference_derivative - output_derivative); + rs_params.add(std::abs(relative_error)); + if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) + { + sout << "Gradient error in parameter #" << i <<". Relative error: "<< relative_error << std::endl; + sout << "expected derivative: " << reference_derivative << std::endl; + sout << "output derivative: " << output_derivative << std::endl; + sout << "iteration: " << iter << std::endl; + return layer_test_results(sout.str()); + } + } + + // ================================================================== + // now validate the data gradients + for (unsigned long i = 0; i < num_data_inputs; ++i) + { + const float oldval = subnetwork.get_output_element(i); + float eps = oldval*base_eps; + if (eps == 0) + eps = base_eps; + subnetwork.get_output_element(i) = oldval+eps; + impl::call_layer_forward(l, subnetwork, out2); + subnetwork.get_output_element(i) = oldval-eps; + impl::call_layer_forward(l, subnetwork, out3); + subnetwork.get_output_element(i) = oldval; + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. 
+ double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); + double output_derivative = subnetwork.get_gradient_input_element(i); + output_derivative -= initial_gradient_input[i]; + double relative_error; + if (reference_derivative*output_derivative != 0) + relative_error = (reference_derivative - output_derivative)/(reference_derivative); + else + relative_error = (reference_derivative - output_derivative); + double absolute_error = (reference_derivative - output_derivative); + rs_data.add(std::abs(relative_error)); + if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) + { + sout << "Gradient error in data variable #" << i <<". Relative error: "<< relative_error << std::endl; + sout << "expected derivative: " << reference_derivative << std::endl; + sout << "output derivative: " << output_derivative << std::endl; + sout << "iteration: " << iter << std::endl; + return layer_test_results(sout.str()); + } + } + + } // end for (int iter = 0; iter < 10; ++iter) + + if (rs_params.mean() > 0.003) + { + sout << "Average parameter gradient error is somewhat large at: "<< rs_params.mean() << std::endl; + return layer_test_results(sout.str()); + } + if (rs_data.mean() > 0.003) + { + sout << "Average data gradient error is somewhat large at: "<< rs_data.mean() << std::endl; + return layer_test_results(sout.str()); + } + + return layer_test_results(); + } + + template < + typename layer_details_type + > + layer_test_results test_layer( + layer_details_type l + ) + { + // Default behavior: use random dimensions (all zeros in params) + return test_layer(l, timpl::test_layer_params()); + } + + template < + typename layer_details_type + > + layer_test_results test_layer( + layer_details_type l, + const timpl::test_layer_params& params + ) + { + // Try a few different derivative step sizes to see if any work. 
+ for (float base_eps = 0.0001; base_eps < 0.1; base_eps *= 2) + { + auto result = impl_test_layer(l, base_eps, params); + if (result) + return result; + } + // However, if none of the step sizes worked then try this one and probably result + // in returning an error. + return impl_test_layer(l, 0.01, params); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template + struct vl_loop + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type& net, + visitor&& v + ) + { + // Call whatever version of the visitor the user provided. + call_if_valid(v, i, layer(net)); + call_if_valid(v, layer(net)); + vl_loop::visit(net,v); + } + }; + + template + struct vl_loop + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. + } + }; + + template + struct vl_loop_backwards + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type& net, + visitor&& v + ) + { + vl_loop_backwards::visit(net,v); + // Call whatever version of the visitor the user provided. + call_if_valid(v, i, layer(net)); + call_if_valid(v, layer(net)); + } + }; + + template + struct vl_loop_backwards + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. 
+ } + }; + + } + + template < + typename net_type, + typename visitor + > + void visit_layers( + net_type& net, + visitor v + ) + { + impl::vl_loop<0, net_type::num_layers>::visit(net, v); + } + + template < + typename net_type, + typename visitor + > + void visit_layers_backwards( + net_type& net, + visitor v + ) + { + impl::vl_loop_backwards<0, net_type::num_layers>::visit(net, v); + } + + template < + size_t begin, + size_t end, + typename net_type, + typename visitor + > + void visit_layers_range( + net_type& net, + visitor v + ) + { + static_assert(begin <= end, "Invalid range"); + static_assert(end <= net_type::num_layers, "Invalid range"); + impl::vl_loop::visit(net, v); + } + + template < + size_t begin, + size_t end, + typename net_type, + typename visitor + > + void visit_layers_backwards_range( + net_type& net, + visitor v + ) + { + static_assert(begin <= end, "Invalid range"); + static_assert(end <= net_type::num_layers, "Invalid range"); + impl::vl_loop_backwards::visit(net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template + struct vl_until_tag + { + template < + typename net_type, + typename next_net_type, + typename visitor + > + static void visit( + net_type& net, + next_net_type& next_net, + visitor&& v + ) + { + call_if_valid(v, next_net); + vl_until_tag::visit(net,layer(net),v); + } + + template < + typename net_type, + typename SUBNET, + typename visitor + > + static void visit( + net_type&, + const add_tag_layer& next_net, + visitor&& v + ) + { + call_if_valid(v, next_net); + } + + template < + typename net_type, + typename SUBNET, + typename visitor + > + static void visit( + net_type&, + add_tag_layer& next_net, + visitor&& v + ) + { + call_if_valid(v, next_net); + } + }; + } + + template < + unsigned long tag_id, + typename net_type, + typename visitor + > + void visit_layers_until_tag( + net_type& net, + visitor v + ) + { + 
impl::vl_until_tag<0,tag_id>::visit(net, net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template < + typename visitor + > + class visitor_computational_layer + { + public: + explicit visitor_computational_layer(visitor& v) : v_(v) {} + + template + void do_visit(size_t idx, layer& l) const + { + // Call whatever version of the visitor the user provided. + call_if_valid(v_, idx, l.layer_details()); + call_if_valid(v_, l.layer_details()); + } + + // const case + template + void operator()(size_t idx, const add_layer& l) const { do_visit(idx, l); } + // non-const cast + template + void operator()(size_t idx, add_layer& l) const { do_visit(idx, l); } + + private: + + visitor& v_; + }; + } + + template < + typename net_type, + typename visitor + > + void visit_computational_layers( + net_type& net, + visitor v + ) + { + visit_layers(net, impl::visitor_computational_layer(v)); + } + + template < + size_t begin, + size_t end, + typename net_type, + typename visitor + > + void visit_computational_layers_range( + net_type& net, + visitor v + ) + { + visit_layers_range(net, impl::visitor_computational_layer(v)); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template < + typename visitor + > + class visit_layer_parameters + { + public: + explicit visit_layer_parameters(visitor& v) : v_(v) {} + + template + void operator()(layer& l) + { + // Call whatever version of the visitor the user provided. 
+ const bool visitor_called = call_if_valid(v_, computational_layer_idx, l.get_layer_params()) || + call_if_valid(v_, l.get_layer_params()); + DLIB_CASSERT(visitor_called, "A visitor function with an incorrect signature was given to visit_layer_parameters()"); + ++computational_layer_idx; + } + private: + + size_t computational_layer_idx = 0; + visitor& v_; + }; + } + + template < + typename net_type, + typename visitor + > + void visit_layer_parameters( + net_type& net, + visitor v + ) + { + visit_computational_layers(net, impl::visit_layer_parameters(v)); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template < + typename visitor + > + class visit_layer_parameter_gradients + { + public: + explicit visit_layer_parameter_gradients(visitor& v) : v_(v) {} + + template + void do_visit(layer& l) + { + // Call whatever version of the visitor the user provided. + const bool visitor_called = call_if_valid(v_, computational_layer_idx, l.get_parameter_gradient()) || + call_if_valid(v_, l.get_parameter_gradient()); + DLIB_CASSERT(visitor_called, "A visitor function with an incorrect signature was given to visit_layer_parameter_gradients()"); + ++computational_layer_idx; + } + + // const version + template + void operator()(const add_layer& l) { do_visit(l); } + // non-const version + template + void operator()(add_layer& l) { do_visit(l); } + + private: + + size_t computational_layer_idx = 0; + visitor& v_; + }; + } + + template < + typename net_type, + typename visitor + > + void visit_layer_parameter_gradients( + net_type& net, + visitor v + ) + { + visit_layers(net, impl::visit_layer_parameter_gradients(v)); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_CORE_H_ + + diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h index f7985fcc32..d7c8889636 100644 --- a/dlib/dnn/layers.h +++ b/dlib/dnn/layers.h @@ -1017,19 
+1017,10 @@ namespace dlib void setup(const SUBNET& sub) { const auto& input = sub.get_output(); - input_k = input.k(); - input_nr = input.nr(); - input_nc = input.nc(); - - // Calculate output dimensions using input dims where target is -1 - if (k_ == -1) output_k = input_k; - if (nr_ == -1) output_nr = input_nr; - if (nc_ == -1) output_nc = input_nc; + update_dimensions_from_input(input); - // Check if this is well a pure reshape long input_elements = input_k * input_nr * input_nc; long output_elements = output_k * output_nr * output_nc; - if (input_elements != output_elements && input_k == output_k) needs_rescale = true; DLIB_CASSERT(input_elements == output_elements || needs_rescale, "Cannot reshape tensor of " << input_elements << " elements into shape with " << output_elements << " elements. " << @@ -1039,8 +1030,14 @@ namespace dlib template void forward(const SUBNET& sub, resizable_tensor& output) { - // Set the output size (always preserving batch dimension) const tensor& input = sub.get_output(); + + // Check if dimensions changed (after deserialization or fine-tuning) + // This ensures dimensions are always synchronized with current input + if (input_k != input.k() || input_nr != input.nr() || input_nc != input.nc()) + update_dimensions_from_input(input); + + // Set the output size (always preserving batch dimension) output.set_size(input.num_samples(), output_k, output_nr, output_nc); if (!needs_rescale) @@ -1142,7 +1139,25 @@ namespace dlib << "/>\n"; } - private: + private: + void update_dimensions_from_input(const tensor& input) + { + // Update input dimensions + input_k = input.k(); + input_nr = input.nr(); + input_nc = input.nc(); + + // Recalculate output dimensions for dynamic axes (-1) + if (k_ == -1) output_k = input_k; + if (nr_ == -1) output_nr = input_nr; + if (nc_ == -1) output_nc = input_nc; + + // Check if rescaling is needed + long input_elements = input_k * input_nr * input_nc; + long output_elements = output_k * output_nr * output_nc; 
+ needs_rescale = (input_elements != output_elements && input_k == output_k); + } + long input_k, input_nr, input_nc; // Input dimensions long output_k, output_nr, output_nc; // Output dimensions bool needs_rescale; @@ -2407,7 +2422,7 @@ namespace dlib { const auto& prev_output = sub.get_output(); DLIB_CASSERT((long)num_inputs == prev_output.nc(), - "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with."); + "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with."); output.set_size(prev_output.num_samples(), prev_output.k(), prev_output.nr(), num_outputs); auto o = alias_tensor(output.num_samples() * output.k() * output.nr(), num_outputs)(output, 0); @@ -2441,8 +2456,6 @@ namespace dlib } } - //prev_gradient is not const, so that sgi isn't const - //since sgi is used as a destination for tt::gemm auto& prev_gradient = sub.get_gradient_input(); alias_tensor_instance sgi = alias_tensor(prev_gradient.num_samples() * prev_gradient.k() * prev_gradient.nr(), num_inputs)(prev_gradient, 0); auto w = weights(params, 0); @@ -5441,7 +5454,8 @@ namespace dlib embeddings_() : num_embeddings(num_embeddings_), embedding_dim(embedding_dim_), learning_rate_multiplier(1.0f), - scale_by_freq(true) + scale_by_freq(true), + output_scale(std::sqrt(static_cast(embedding_dim_))) { } @@ -5473,12 +5487,17 @@ namespace dlib } } + float get_output_scale() const { return output_scale; } + template void setup(const SUBNET& /*sub*/) { embs.set_size(num_embeddings, embedding_dim); tt::tensor_rand rnd(std::rand()); rnd.fill_gaussian(embs); + + const float init_scale = 1.0f / std::sqrt(static_cast(embedding_dim)); + tt::affine_transform(embs, embs, init_scale); } template @@ -5488,6 +5507,7 @@ namespace dlib output.set_size(prev.num_samples(), prev.k(), prev.nr(), embedding_dim); tt::embeddings(output, prev, embs); + tt::affine_transform(output, output, output_scale); } template @@ -5502,7 
+5522,8 @@ namespace dlib auto& prev_src = sub.get_output(); calc_token_freqs(prev_src, gradient_input); - tt::embeddings_gradient(prev_src, gradient_input, embs, freqs, learning_rate_multiplier, scale_by_freq); + const float scaled_lr = learning_rate_multiplier * output_scale; + tt::embeddings_gradient(prev_src, gradient_input, embs, freqs, scaled_lr, scale_by_freq); } } @@ -5520,6 +5541,7 @@ namespace dlib serialize(item.embedding_dim, out); serialize(item.learning_rate_multiplier, out); serialize(item.scale_by_freq, out); + serialize(item.output_scale, out); } friend void deserialize(embeddings_& item, std::istream& in) { @@ -5532,12 +5554,14 @@ namespace dlib deserialize(item.embedding_dim, in); deserialize(item.learning_rate_multiplier, in); deserialize(item.scale_by_freq, in); + deserialize(item.output_scale, in); } friend std::ostream& operator<<(std::ostream& out, const embeddings_& item) { out << "embeddings (num_embeddings=" << item.num_embeddings << ", embedding_dim=" << item.embedding_dim + << ", scale=" << item.output_scale << ") learning_rate_mult=" << item.learning_rate_multiplier; return out; } @@ -5545,6 +5569,7 @@ namespace dlib { out << "\n"; out << mat(item.embs); @@ -5576,6 +5601,7 @@ namespace dlib unsigned long num_embeddings, embedding_dim; double learning_rate_multiplier; bool scale_by_freq; + float output_scale; }; template < @@ -5587,6 +5613,113 @@ namespace dlib // ---------------------------------------------------------------------------------------- + class tril_padding_context + { + public: + static void set(const tensor& input_tokens, long padding_token) + { + if (padding_token < 0) { + clear(); + return; + } + std::lock_guard lock(get_mutex_()); + const long batch_size = input_tokens.num_samples(); + const long seq_len = input_tokens.nr(); + const float* data = input_tokens.host(); + get_padding_lengths_().resize(batch_size); + for (long s = 0; s < batch_size; ++s) + { + long count = 0; + for (long t = 0; t < seq_len; ++t) + { + 
const long idx = s * seq_len + t; + const long token = static_cast(data[idx]); + if (token == padding_token) + count++; + else + break; + } + get_padding_lengths_()[s] = count; + } + get_is_set_() = true; + } + + static void set_from_lengths(const std::vector& lengths) + { + std::lock_guard lock(get_mutex_()); + get_padding_lengths_() = lengths; + get_is_set_() = true; + } + + static void set_uniform(long padding_length, long batch_size) + { + std::lock_guard lock(get_mutex_()); + get_padding_lengths_().assign(batch_size, padding_length); + get_is_set_() = true; + } + + static void clear() + { + std::lock_guard lock(get_mutex_()); + get_padding_lengths_().clear(); + get_is_set_() = false; + } + + static long get_padding_length(long sample_idx) + { + std::lock_guard lock(get_mutex_()); + if (!get_is_set_() || sample_idx < 0 || + sample_idx >= static_cast(get_padding_lengths_().size())) + return 0; + return get_padding_lengths_()[sample_idx]; + } + + static std::vector get_all_lengths() + { + std::lock_guard lock(get_mutex_()); + return get_padding_lengths_(); + } + + static bool is_set() + { + std::lock_guard lock(get_mutex_()); + return get_is_set_(); + } + + private: + static std::mutex& get_mutex_() + { + static std::mutex m; + return m; + } + + static std::vector& get_padding_lengths_() + { + static std::vector lengths; + return lengths; + } + + static bool& get_is_set_() + { + static bool is_set = false; + return is_set; + } + }; + + template + long count_leading_padding(const matrix& seq, T padding_token) + { + long count = 0; + for (long i = 0; i < seq.size(); ++i) + { + if (seq(i) == padding_token) count++; + else break; + } + return count; + } + +// ---------------------------------------------------------------------------------------- + struct neg_infinity_tag {}; struct zero_tag {}; @@ -5601,7 +5734,7 @@ namespace dlib class tril_ { public: - tril_(): diag(diag_), diag_value(compute_diag_value()) {} + tril_(): diag(diag_), prefix_size(0), 
diag_value(compute_diag_value()) {} template void setup(const SUBNET& /*sub*/) @@ -5614,10 +5747,28 @@ namespace dlib auto& prev = sub.get_output(); output.set_size(prev.num_samples(), prev.k(), prev.nr(), prev.nc()); + // Check padding context and update cached lengths if needed + if (tril_padding_context::is_set()) + { + auto new_lengths = tril_padding_context::get_all_lengths(); + if (new_lengths != cached_padding_lengths_) + { + cached_padding_lengths_ = new_lengths; + invalidate_mask(); + } + } + else if (!cached_padding_lengths_.empty()) + { + // Context was cleared, reset padding + cached_padding_lengths_.clear(); + invalidate_mask(); + } + check_mask(prev); tt::multiply(false, output, prev, binary_mask); if (diag_value != 0.0f) tt::add(1, output, 1, output_mask); } + template void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) { @@ -5630,6 +5781,15 @@ namespace dlib const tensor& get_layer_params() const { return params; } tensor& get_layer_params() { return params; } + + void set_prefix_size(long n_prefix_size) + { + if (prefix_size != n_prefix_size) { + prefix_size = n_prefix_size; + invalidate_mask(); + } + } + long get_prefix_size() const { return prefix_size; } friend void serialize(const tril_& item, std::ostream& out) { @@ -5667,25 +5827,66 @@ namespace dlib return static_cast(num_) / static_cast(den_); } + void invalidate_mask() + { + binary_mask.set_size(0, 0, 0, 0); + output_mask.set_size(0, 0, 0, 0); + } + void check_mask(const tensor& t) { - if (!have_same_dimensions(binary_mask, t)) { + if (!have_same_dimensions(binary_mask, t)) + { binary_mask.copy_size(t); binary_mask = 1; - if (diag_value != 0.0f) { + + const bool use_output_mask = (diag_value != 0.0f); + if (use_output_mask) { output_mask.copy_size(t); output_mask = 0; - } - for (long s = 0; s < output_mask.num_samples(); ++s) + } + + const bool has_padding = !cached_padding_lengths_.empty(); + + for (long s = 0; s < t.num_samples(); ++s) { - for (long k = 0; k 
< output_mask.k(); ++k) + const long pad_len = has_padding && + s < static_cast(cached_padding_lengths_.size()) + ? cached_padding_lengths_[s] : 0; + + for (long k = 0; k < t.k(); ++k) { - for (long r = 0; r < output_mask.nr(); ++r) + for (long r = 0; r < t.nr(); ++r) { - for (long c = std::max(r + diag + 1, 0L); c < output_mask.nc(); ++c) + // Mask padding columns + for (long c = 0; c < pad_len; ++c) + { + const long idx = tensor_index(t, s, k, r, c); + binary_mask.host()[idx] = 0; + if (use_output_mask) + output_mask.host()[idx] = diag_value; + } + + // Mask future positions (causal) + const long causal_start = std::max({ r + diag + 1, prefix_size, pad_len }); + for (long c = causal_start; c < t.nc(); ++c) { - if (diag_value != 0.0f) output_mask.host()[tensor_index(output_mask, s, k, r, c)] = diag_value; - binary_mask.host()[tensor_index(binary_mask, s, k, r, c)] = 0; + const long idx = tensor_index(t, s, k, r, c); + binary_mask.host()[idx] = 0; + if (use_output_mask) + output_mask.host()[idx] = diag_value; + } + + // Mask padding rows + if (r < pad_len) + { + for (long c = 0; c < t.nc(); ++c) + { + const long idx = tensor_index(t, s, k, r, c); + binary_mask.host()[idx] = 0; + if (use_output_mask) + output_mask.host()[idx] = diag_value; + } } } } @@ -5699,7 +5900,9 @@ namespace dlib resizable_tensor params; // unused resizable_tensor binary_mask, output_mask; long diag; + long prefix_size; float diag_value; + std::vector cached_padding_lengths_; }; template @@ -5742,8 +5945,7 @@ namespace dlib num_channels_(item.num_channels_), feature_dim_(item.feature_dim_), ponder_cost_(item.ponder_cost_), - avg_steps_(item.avg_steps_), - params(item.params), + avg_steps_(item.avg_steps_), halting_probs_(item.halting_probs_), cumulative_halting_(item.cumulative_halting_), remainders_(item.remainders_), @@ -5751,7 +5953,8 @@ namespace dlib logits_(item.logits_), grad_logits_(item.grad_logits_), input_cache_(item.input_cache_), - 
true_effective_weights_(item.true_effective_weights_) + true_effective_weights_(item.true_effective_weights_), + params(item.params) { } @@ -5770,8 +5973,7 @@ namespace dlib num_channels_ = item.num_channels_; feature_dim_ = item.feature_dim_; ponder_cost_ = item.ponder_cost_; - avg_steps_ = item.avg_steps_; - params = item.params; + avg_steps_ = item.avg_steps_; halting_probs_ = item.halting_probs_; cumulative_halting_ = item.cumulative_halting_; remainders_ = item.remainders_; @@ -5780,6 +5982,7 @@ namespace dlib grad_logits_ = item.grad_logits_; input_cache_ = item.input_cache_; true_effective_weights_ = item.true_effective_weights_; + params = item.params; return *this; } @@ -6051,9 +6254,6 @@ namespace dlib long num_channels_; long feature_dim_; - // Learnable parameters - resizable_tensor params; - // Working memory resizable_tensor halting_probs_; // p_t^n: Halting probabilities resizable_tensor cumulative_halting_; // h_t^n: Cumulative halting probabilities @@ -6067,6 +6267,9 @@ namespace dlib // Statistics for monitoring float ponder_cost_; // R(x): Current ponder cost float avg_steps_; // Average number of computation steps + + // Learnable parameters + resizable_tensor params; }; template @@ -6081,6 +6284,808 @@ namespace dlib template using act16 = add_layer, SUBNET>; // Deep version +// ---------------------------------------------------------------------------------------- + + // YaRN configuration structure + struct yarn_config + { + // Alpha controls overall intensity of scaling (typical ~1.0) + float alpha = 1.0f; + + // Beta controls curvature of scaling across head dimensions (typical 0.25..0.5) + float beta = 0.5f; + + // original_len is the context length used at training time + // If 0, it will be set to the first seq_len observed (common pattern) + long original_len = 0; + + // Enable/disable YaRN; if false, behavior is identical to classical RoPE + bool enabled = true; + }; + + class rotary_positional_embedding_ + { + public: + explicit 
rotary_positional_embedding_() : + seq_len(0), + d_head(0), + theta_base(10000.0f) + { + } + + rotary_positional_embedding_(const rotary_positional_embedding_& other) : + seq_len(other.seq_len), + d_head(other.d_head), + theta_base(other.theta_base), + cos_cache(other.cos_cache), + sin_cache(other.sin_cache), + yarn(other.yarn) + { + } + + rotary_positional_embedding_& operator=(const rotary_positional_embedding_& other) + { + if (this != &other) { + seq_len = other.seq_len; + d_head = other.d_head; + theta_base = other.theta_base; + cos_cache = other.cos_cache; + sin_cache = other.sin_cache; + yarn = other.yarn; + } + return *this; + } + + // Set base used to compute inverse frequencies (theta base > 0) + void set_theta_base(float base) + { + DLIB_CASSERT(base > 0, "Theta base must be positive"); + theta_base = base; + } + + float get_theta_base() const { return theta_base; } + long get_seq_len() const { return seq_len; } + long get_d_head() const { return d_head; } + + // Configure YaRN hyperparameters + void set_yarn_params(float alpha, float beta, long original_len = 0, bool enabled = true) + { + DLIB_CASSERT(alpha >= 0, "alpha must be non-negative"); + DLIB_CASSERT(beta >= 0, "beta must be non-negative"); + yarn.alpha = alpha; + yarn.beta = beta; + yarn.original_len = original_len; + yarn.enabled = enabled; + } + const yarn_config& get_yarn_config() const { return yarn; } + + template + void setup(const SUBNET& sub) + { + const tensor& input = sub.get_output(); + + // Expected input shape: (batch, num_heads, seq_len, d_head) + seq_len = input.nr(); + d_head = input.nc(); + + DLIB_CASSERT(d_head >= 2, "d_head must be at least 2 for rotation"); + DLIB_CASSERT(seq_len > 0, "seq_len must be positive"); + + // If original_len not set, treat the setup seq_len as the model's training length + if (yarn.original_len == 0) yarn.original_len = seq_len; + + // Precompute rotation angles and trigonometric values + compute_and_cache_trig_values(seq_len); + } + + template + 
void forward(const SUBNET& sub, resizable_tensor& output) + { + const tensor& input = sub.get_output(); + + // Validate shape; we expect shape (batch, num_heads, seq_len, d_head) + const long in_seq_len = input.nr(); + const long in_d_head = input.nc(); + + DLIB_CASSERT(in_d_head >= 2, "d_head must be at least 2 for rotation"); + DLIB_CASSERT(in_seq_len > 0, "seq_len must be positive"); + + // If setup() was not called or the incoming sequence length changed from + // the cached seq_len (e.g. inference with a different context window), + // recompute trig caches for the current seq_len. + if (seq_len != in_seq_len || d_head != in_d_head + || cos_cache.size() == 0 || sin_cache.size() == 0) + { + // If we don't have a recorded original_len yet, set it here (first observed seq_len) + if (yarn.original_len == 0) yarn.original_len = in_seq_len; + + // Update internal dimensions and recompute caches targeted to in_seq_len + seq_len = in_seq_len; + d_head = in_d_head; + compute_and_cache_trig_values(seq_len); + } + + output.copy_size(input); + + // Copy input to output + tt::copy_tensor(false, output, 0, input, 0, input.k()); + + // Apply rotary embedding in-place + tt::apply_rotary_positional_embedding( + false, // forward pass + output, + cos_cache, + sin_cache + ); + } + + template + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + tensor& prev_grad = sub.get_gradient_input(); + + // Apply inverse rotation to gradients + resizable_tensor grad_output; + grad_output.copy_size(gradient_input); + tt::copy_tensor(false, grad_output, 0, gradient_input, 0, gradient_input.k()); + + tt::apply_rotary_positional_embedding( + true, // backward pass (inverse rotation) + grad_output, + cos_cache, + sin_cache + ); + + // Accumulate gradients + tt::copy_tensor(true, prev_grad, 0, grad_output, 0, grad_output.k()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void 
serialize(const rotary_positional_embedding_& item, std::ostream& out) + { + serialize("rope_", out); + serialize(item.theta_base, out); + serialize(item.cos_cache, out); + serialize(item.sin_cache, out); + + // yarn config + serialize(item.yarn.alpha, out); + serialize(item.yarn.beta, out); + serialize(item.yarn.original_len, out); + serialize(item.yarn.enabled, out); + } + + friend void deserialize(rotary_positional_embedding_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "rope_") + throw serialization_error("Unexpected version '" + version + + "' while deserializing rope_"); + + deserialize(item.theta_base, in); + deserialize(item.cos_cache, in); + deserialize(item.sin_cache, in); + + // yarn config + deserialize(item.yarn.alpha, in); + deserialize(item.yarn.beta, in); + deserialize(item.yarn.original_len, in); + deserialize(item.yarn.enabled, in); + } + + friend std::ostream& operator<<(std::ostream& out, const rotary_positional_embedding_& item) + { + out << "rope (theta_base=" << item.theta_base + << ", yarn.alpha=" << item.yarn.alpha + << ", yarn.beta=" << item.yarn.beta + << ", yarn.original_len=" << item.yarn.original_len + << ", yarn.enabled=" << (item.yarn.enabled ? 
"true" : "false") + << ")"; + return out; + } + + friend void to_xml(const rotary_positional_embedding_& item, std::ostream& out) + { + out << "\n"; + } + + inline dpoint map_input_to_output(const dpoint& p) const { return p; } + inline dpoint map_output_to_input(const dpoint& p) const { return p; } + + private: + // Compute and cache cosine/sine tables for target_seq_len + // This function uses YaRN scaling when yarn.enabled is true + void compute_and_cache_trig_values(long target_seq_len) + { + if (seq_len == 0 || d_head == 0) return; + + // Half the head dimension (we rotate pairs) + const long half_dim = d_head / 2; + + // Allocate cache tensors: shape (1, 1, seq_len, half_dim) + cos_cache.set_size(1, 1, seq_len, half_dim); + sin_cache.set_size(1, 1, seq_len, half_dim); + + // Compute on host side + float* cos_ptr = cos_cache.host(); + float* sin_ptr = sin_cache.host(); + + // Precompute inv_freq constant per dimension (independent of position) + // inv_freq_i = theta_base^(-2i/d_head) + std::vector inv_freq(half_dim); + for (long i = 0; i < half_dim; ++i) + inv_freq[i] = std::pow(theta_base, -2.0f * i / static_cast(d_head)); + + // Determine the training length to use for YaRN scaling + const long train_len = (yarn.original_len > 0) ? 
yarn.original_len : target_seq_len; + + // Compute cos/sin for each position and frequency index, using YaRN if enabled + for (long pos = 0; pos < target_seq_len; ++pos) + { + for (long i = 0; i < half_dim; ++i) + { + // Base angle: pos * inv_freq[i] + float pos_scaled = static_cast(pos); + + if (yarn.enabled) + { + // Compute dimension-normalized index in [0,1] + const float dim_norm = static_cast(i) / static_cast(half_dim); + + // exponent = alpha * dim_norm^beta + // Note: we use half_dim for normalization so higher-frequency dims get smaller exponent + const float exponent = yarn.alpha * std::pow(dim_norm, yarn.beta); + + // scale = (target_len / train_len)^exponent + // This allows small-dim (low freq) to scale less than high-dim if desired + const float ratio = static_cast(target_seq_len) / static_cast(train_len); + const float scale = std::pow(ratio, exponent); + + // Scaled position used to compute the angle + pos_scaled = static_cast(pos) * scale; + } + + const float angle = pos_scaled * inv_freq[i]; + + const long idx = pos * half_dim + i; + cos_ptr[idx] = std::cos(angle); + sin_ptr[idx] = std::sin(angle); + } + } + } + + // Configuration + long seq_len; + long d_head; + float theta_base; + + // Precomputed trigonometric values + // Shape: (1, 1, seq_len, d_head/2) + resizable_tensor cos_cache; + resizable_tensor sin_cache; + + // YaRN configuration + yarn_config yarn; + + // No trainable parameters + resizable_tensor params; + }; + + template + using rope = add_layer; + +// ---------------------------------------------------------------------------------------- + + template < + long patch_size_, + long embedding_dim_, + long use_class_token_, + long use_position_embeddings_ + > + class patch_embeddings_ + { + static_assert(patch_size_ > 0, "Patch size must be positive"); + static_assert(embedding_dim_ > 0, "Embedding dimension must be positive"); + static_assert(use_class_token_ == 0 || use_class_token_ == 1, + "use_class_token must be 0 or 1"); + 
static_assert(use_position_embeddings_ == 0 || use_position_embeddings_ == 1, + "use_position_embeddings must be 0 or 1"); + + public: + + patch_embeddings_() : + in_channels(0), + num_patches_h(0), + num_patches_w(0), + cached_input_h(0), + cached_input_w(0), + cached_input_k(0), + learning_rate_multiplier(1.0) + { + } + + patch_embeddings_(const patch_embeddings_& other) : + in_channels(other.in_channels), + num_patches_h(other.num_patches_h), + num_patches_w(other.num_patches_w), + cached_input_h(other.cached_input_h), + cached_input_w(other.cached_input_w), + cached_input_k(other.cached_input_k), + learning_rate_multiplier(other.learning_rate_multiplier), + params(other.params), + filters_alias(other.filters_alias), + biases_alias(other.biases_alias), + pos_embed_alias(other.pos_embed_alias), + cls_token_alias(other.cls_token_alias) + { + } + + patch_embeddings_& operator=(const patch_embeddings_& other) + { + if (this != &other) { + in_channels = other.in_channels; + num_patches_h = other.num_patches_h; + num_patches_w = other.num_patches_w; + cached_input_h = other.cached_input_h; + cached_input_w = other.cached_input_w; + cached_input_k = other.cached_input_k; + learning_rate_multiplier = other.learning_rate_multiplier; + params = other.params; + filters_alias = other.filters_alias; + biases_alias = other.biases_alias; + pos_embed_alias = other.pos_embed_alias; + cls_token_alias = other.cls_token_alias; + // Note: conv_op is non-copyable and stateless, will be re-setup on forward() + } + return *this; + } + + long get_patch_size() const { return patch_size_; } + long get_embedding_dim() const { return embedding_dim_; } + long uses_class_token() const { return use_class_token_; } + long uses_position_embeddings() const { return use_position_embeddings_; } + long get_num_patches() const { return num_patches_h * num_patches_w; } + + double get_learning_rate_multiplier() const { return learning_rate_multiplier; } + void set_learning_rate_multiplier(double val) { 
learning_rate_multiplier = val; } + + template + void setup(const SUBNET& sub) + { + const tensor& input = sub.get_output(); + in_channels = input.k(); + + DLIB_CASSERT(input.nr() % patch_size_ == 0, + "Image height must be divisible by patch size. Got height=" << input.nr() + << ", patch_size=" << patch_size_); + DLIB_CASSERT(input.nc() % patch_size_ == 0, + "Image width must be divisible by patch size. Got width=" << input.nc() + << ", patch_size=" << patch_size_); + + num_patches_h = input.nr() / patch_size_; + num_patches_w = input.nc() / patch_size_; + const long num_patches = num_patches_h * num_patches_w; + const long sequence_length = num_patches + use_class_token_; + + // Calculate total parameter size: + // - projection_filters: embedding_dim * in_channels * patch_size * patch_size + // - projection_biases: embedding_dim + // - position_embeddings (optional): sequence_length * embedding_dim + // - class_token (optional): embedding_dim + const long filter_size = embedding_dim_ * in_channels * patch_size_ * patch_size_; + const long bias_size = embedding_dim_; + const long pos_embed_size = use_position_embeddings_ ? sequence_length * embedding_dim_ : 0; + const long cls_token_size = use_class_token_ ? 
embedding_dim_ : 0; + const long total_params = filter_size + bias_size + pos_embed_size + cls_token_size; + + // Allocate all parameters in a single contiguous tensor + params.set_size(total_params); + + // Setup alias tensors for accessing parameter regions + filters_alias = alias_tensor(embedding_dim_, in_channels, patch_size_, patch_size_); + biases_alias = alias_tensor(1, embedding_dim_, 1, 1); + + if (use_position_embeddings_) { + pos_embed_alias = alias_tensor(1, 1, sequence_length, embedding_dim_); + } + if (use_class_token_) { + cls_token_alias = alias_tensor(1, 1, 1, embedding_dim_); + } + + // Initialize parameters with Xavier/Glorot for filters + tt::tensor_rand rnd; + const float fan_in = static_cast(in_channels * patch_size_ * patch_size_); + const float fan_out = static_cast(embedding_dim_); + const float xavier_stddev = std::sqrt(2.0f / (fan_in + fan_out)); + + // Initialize filter weights + auto filt = filters_alias(params, 0); + rnd.fill_gaussian(filt, 0.0f, xavier_stddev); + + // Initialize biases to zero + auto bias = biases_alias(params, filters_alias.size()); + bias = 0; + + // Initialize position embeddings if enabled + if (use_position_embeddings_) { + auto pos = pos_embed_alias(params, filters_alias.size() + biases_alias.size()); + rnd.fill_gaussian(pos, 0.0f, 0.02f); + } + + // Initialize class token if enabled + if (use_class_token_) { + long cls_offset = filters_alias.size() + biases_alias.size(); + if (use_position_embeddings_) cls_offset += pos_embed_alias.size(); + auto cls = cls_token_alias(params, cls_offset); + rnd.fill_gaussian(cls, 0.0f, 0.02f); + } + + // Cache input dimensions and setup convolution + cached_input_h = input.nr(); + cached_input_w = input.nc(); + cached_input_k = input.k(); + conv_op.setup(input, filt, patch_size_, patch_size_, 0, 0); + } + + template + void forward(const SUBNET& sub, resizable_tensor& output) + { + const tensor& input = sub.get_output(); + const long batch_size = input.num_samples(); + + // 
Re-setup convolution if input spatial dimensions changed + if (input.nr() != cached_input_h || + input.nc() != cached_input_w || + input.k() != cached_input_k || + params.size() == 0) + { + DLIB_CASSERT(input.nr() % patch_size_ == 0, + "Image height must be divisible by patch size. Got height=" << input.nr() + << ", patch_size=" << patch_size_); + DLIB_CASSERT(input.nc() % patch_size_ == 0, + "Image width must be divisible by patch size. Got width=" << input.nc() + << ", patch_size=" << patch_size_); + + cached_input_h = input.nr(); + cached_input_w = input.nc(); + cached_input_k = input.k(); + num_patches_h = input.nr() / patch_size_; + num_patches_w = input.nc() / patch_size_; + } + + const long num_patches = num_patches_h * num_patches_w; + const long sequence_length = num_patches + use_class_token_; + + // Get parameter aliases + auto filt = filters_alias(params, 0); + auto bias = biases_alias(params, filters_alias.size()); + conv_op.setup(input, filt, patch_size_, patch_size_, 0, 0); + + // Step 1: apply convolution (patch extraction + projection) + conv_output.set_size(batch_size, embedding_dim_, num_patches_h, num_patches_w); + conv_op(false, conv_output, input, filt); + + // Add bias using broadcasting + tt::add(1.0f, conv_output, 1.0f, bias); + + // Step 2: reshape from (batch, embed, H/P, W/P) to (batch, 1, num_patches, embed) + patch_sequence.set_size(batch_size, 1, num_patches, embedding_dim_); + reshape_conv_to_sequence(conv_output, patch_sequence); + + // Step 3: prepend class token if enabled + if (use_class_token_) { + long cls_offset = filters_alias.size() + biases_alias.size(); + if (use_position_embeddings_) cls_offset += pos_embed_alias.size(); + auto cls = cls_token_alias(params, cls_offset); + + output.set_size(batch_size, 1, sequence_length, embedding_dim_); + prepend_class_token(patch_sequence, cls, output); + } + else { + output.copy_size(patch_sequence); + tt::copy_tensor(false, output, 0, patch_sequence, 0, patch_sequence.k()); + } + + // 
Step 4: add position embeddings if enabled + if (use_position_embeddings_) { + auto pos = pos_embed_alias(params, filters_alias.size() + biases_alias.size()); + tt::add(1.0f, output, 1.0f, pos); + } + } + + template + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + const long batch_size = gradient_input.num_samples(); + const long num_patches = num_patches_h * num_patches_w; + + // Get parameter aliases from params + auto filt = filters_alias(params, 0); + + // Get gradient aliases from params_grad + auto filt_grad = filters_alias(params_grad, 0); + auto bias_grad = biases_alias(params_grad, filters_alias.size()); + + // Step 1: gradient for position embeddings (if enabled) + if (use_position_embeddings_) { + auto pos_grad = pos_embed_alias(params_grad, filters_alias.size() + biases_alias.size()); + // Zero out and accumulate across batch + pos_grad = 0; + sum_across_batch_to_alias(gradient_input, pos_grad); + tt::affine_transform(pos_grad, pos_grad, static_cast(learning_rate_multiplier)); + } + + // Step 2: split gradient between class token and patches + grad_patch_sequence.set_size(batch_size, 1, num_patches, embedding_dim_); + + if (use_class_token_) { + long cls_offset = filters_alias.size() + biases_alias.size(); + if (use_position_embeddings_) cls_offset += pos_embed_alias.size(); + auto cls_grad = cls_token_alias(params_grad, cls_offset); + + cls_grad = 0; + split_class_token_gradient_to_alias(gradient_input, cls_grad, grad_patch_sequence); + tt::affine_transform(cls_grad, cls_grad, static_cast(learning_rate_multiplier)); + } + else { + tt::copy_tensor(false, grad_patch_sequence, 0, gradient_input, 0, gradient_input.k()); + } + + // Step 3: reshape gradient from sequence back to spatial format + grad_conv_output.set_size(batch_size, embedding_dim_, num_patches_h, num_patches_w); + reshape_sequence_to_conv(grad_patch_sequence, grad_conv_output); + + // Step 4: gradient for projection bias + bias_grad = 0; + 
tt::assign_conv_bias_gradient(bias_grad, grad_conv_output); + tt::affine_transform(bias_grad, bias_grad, static_cast(learning_rate_multiplier)); + + // Step 5: gradient for convolution filters + const tensor& input = sub.get_output(); + filt_grad = 0; + conv_op.get_gradient_for_filters(false, grad_conv_output, input, filt_grad); + tt::affine_transform(filt_grad, filt_grad, static_cast(learning_rate_multiplier)); + + // Step 6: gradient for input (accumulate) + tensor& grad_input = sub.get_gradient_input(); + conv_op.get_gradient_for_data(true, grad_conv_output, filt, grad_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const patch_embeddings_& item, std::ostream& out) + { + serialize("patch_embeddings_", out); + serialize(item.in_channels, out); + serialize(item.num_patches_h, out); + serialize(item.num_patches_w, out); + serialize(item.cached_input_h, out); + serialize(item.cached_input_w, out); + serialize(item.cached_input_k, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.params, out); + serialize(item.filters_alias, out); + serialize(item.biases_alias, out); + if (use_position_embeddings_) + serialize(item.pos_embed_alias, out); + if (use_class_token_) + serialize(item.cls_token_alias, out); + } + + friend void deserialize(patch_embeddings_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "patch_embeddings_") + throw serialization_error("Unexpected version '" + version + + "' found while deserializing patch_embeddings_."); + + deserialize(item.in_channels, in); + deserialize(item.num_patches_h, in); + deserialize(item.num_patches_w, in); + deserialize(item.cached_input_h, in); + deserialize(item.cached_input_w, in); + deserialize(item.cached_input_k, in); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.params, in); + deserialize(item.filters_alias, in); + 
deserialize(item.biases_alias, in); + if (use_position_embeddings_) + deserialize(item.pos_embed_alias, in); + if (use_class_token_) + deserialize(item.cls_token_alias, in); + } + + friend std::ostream& operator<<(std::ostream& out, const patch_embeddings_& item) + { + out << "patch_embeddings (patch_size=" << patch_size_ + << ", embedding_dim=" << embedding_dim_ + << ", num_patches=" << item.get_num_patches() + << ", use_class_token=" << use_class_token_ + << ", use_position_embeddings=" << use_position_embeddings_ + << ") learning_rate_mult=" << item.learning_rate_multiplier; + return out; + } + + friend void to_xml(const patch_embeddings_& item, std::ostream& out) + { + out << "\n"; + } + + private: + + // Reshape conv output (batch, embed, H/P, W/P) to sequence (batch, 1, num_patches, embed) + void reshape_conv_to_sequence(const tensor& src, tensor& dest) + { + const long batch_size = src.num_samples(); + const long embed_dim = src.k(); + const long h = src.nr(); + const long w = src.nc(); + const long num_patches = h * w; + + const float* src_ptr = src.host(); + float* dest_ptr = dest.host_write_only(); + + // src[n, d, i, j] -> dest[n, 0, i*w + j, d] + for (long n = 0; n < batch_size; ++n) { + for (long i = 0; i < h; ++i) { + for (long j = 0; j < w; ++j) { + const long patch_idx = i * w + j; + for (long d = 0; d < embed_dim; ++d) { + const long src_idx = ((n * embed_dim + d) * h + i) * w + j; + const long dest_idx = (n * num_patches + patch_idx) * embed_dim + d; + dest_ptr[dest_idx] = src_ptr[src_idx]; + } + } + } + } + } + + // Reshape sequence (batch, 1, num_patches, embed) to conv format (batch, embed, H/P, W/P) + void reshape_sequence_to_conv(const tensor& src, tensor& dest) + { + const long batch_size = src.num_samples(); + const long num_patches = src.nr(); + const long embed_dim = src.nc(); + const long h = dest.nr(); + const long w = dest.nc(); + + const float* src_ptr = src.host(); + float* dest_ptr = dest.host_write_only(); + + // src[n, 0, i*w + j, 
d] -> dest[n, d, i, j] + for (long n = 0; n < batch_size; ++n) { + for (long i = 0; i < h; ++i) { + for (long j = 0; j < w; ++j) { + const long patch_idx = i * w + j; + for (long d = 0; d < embed_dim; ++d) { + const long src_idx = (n * num_patches + patch_idx) * embed_dim + d; + const long dest_idx = ((n * embed_dim + d) * h + i) * w + j; + dest_ptr[dest_idx] = src_ptr[src_idx]; + } + } + } + } + } + + // Prepend class token to patch sequence + void prepend_class_token(const tensor& patches, const tensor& cls_token, tensor& output) + { + const long batch_size = patches.num_samples(); + const long num_patches = patches.nr(); + const long embed_dim = patches.nc(); + const long seq_len = num_patches + 1; + + const float* patches_ptr = patches.host(); + const float* cls_ptr = cls_token.host(); + float* out_ptr = output.host_write_only(); + + for (long n = 0; n < batch_size; ++n) { + // Copy class token to position 0 + for (long d = 0; d < embed_dim; ++d) { + out_ptr[n * seq_len * embed_dim + d] = cls_ptr[d]; + } + // Copy patch embeddings to positions 1..seq_len-1 + for (long s = 0; s < num_patches; ++s) { + for (long d = 0; d < embed_dim; ++d) { + out_ptr[(n * seq_len + s + 1) * embed_dim + d] = + patches_ptr[(n * num_patches + s) * embed_dim + d]; + } + } + } + } + + // Split gradient between class token and patches (writes to alias) + void split_class_token_gradient_to_alias(const tensor& grad_in, tensor& grad_cls, tensor& grad_patches) + { + const long batch_size = grad_in.num_samples(); + const long seq_len = grad_in.nr(); + const long embed_dim = grad_in.nc(); + const long num_patches = seq_len - 1; + + const float* grad_in_ptr = grad_in.host(); + float* grad_cls_ptr = grad_cls.host(); + float* grad_patches_ptr = grad_patches.host_write_only(); + + for (long n = 0; n < batch_size; ++n) { + // Accumulate gradient for class token across batch + for (long d = 0; d < embed_dim; ++d) { + grad_cls_ptr[d] += grad_in_ptr[n * seq_len * embed_dim + d]; + } + // Copy 
gradient for patches + for (long s = 0; s < num_patches; ++s) { + for (long d = 0; d < embed_dim; ++d) { + grad_patches_ptr[(n * num_patches + s) * embed_dim + d] = + grad_in_ptr[(n * seq_len + s + 1) * embed_dim + d]; + } + } + } + } + + // Sum tensor across batch dimension (writes to alias) + void sum_across_batch_to_alias(const tensor& src, tensor& dest) + { + const long batch_size = src.num_samples(); + const long seq_len = src.nr(); + const long embed_dim = src.nc(); + + const float* src_ptr = src.host(); + float* dest_ptr = dest.host(); + + for (long n = 0; n < batch_size; ++n) { + for (long s = 0; s < seq_len; ++s) { + for (long d = 0; d < embed_dim; ++d) { + dest_ptr[s * embed_dim + d] += src_ptr[(n * seq_len + s) * embed_dim + d]; + } + } + } + } + + // Configuration + long in_channels; + long num_patches_h, num_patches_w; + long cached_input_h, cached_input_w, cached_input_k; + double learning_rate_multiplier; + + // All learnable parameters stored in a single tensor + resizable_tensor params; + + // Alias tensors for accessing parameter regions + alias_tensor filters_alias; // (embedding_dim, in_channels, patch_size, patch_size) + alias_tensor biases_alias; // (1, embedding_dim, 1, 1) + alias_tensor pos_embed_alias; // (1, 1, sequence_length, embedding_dim) if enabled + alias_tensor cls_token_alias; // (1, 1, 1, embedding_dim) if enabled + + // Intermediate tensors for forward/backward + resizable_tensor conv_output; + resizable_tensor patch_sequence; + resizable_tensor grad_conv_output; + resizable_tensor grad_patch_sequence; + + // Convolution operation + tt::tensor_conv conv_op; + }; + + template + using patch_embeddings = add_layer, SUBNET>; + // ---------------------------------------------------------------------------------------- } diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h index cbfe81ad66..3222052ae3 100644 --- a/dlib/dnn/layers_abstract.h +++ b/dlib/dnn/layers_abstract.h @@ -4543,6 +4543,81 @@ namespace dlib > using 
embeddings = add_layer, SUBNET>; +// ---------------------------------------------------------------------------------------- + + class tril_padding_context + { + /*! + WHAT THIS OBJECT REPRESENTS + This class provides a shared context for communicating padding information + to tril_ layers during forward passes. It solves the problem of nested + architectures where tril_ layers cannot directly access the input sequence. + The context stores per-sample padding lengths that are computed once + before each forward pass and consulted by all tril_ layers. + + THREAD SAFETY + All methods are thread-safe through internal mutex protection. + + TYPICAL USAGE + // Before forward pass: + tril_padding_context::set(input_tensor, padding_token); + // Or from pre-computed lengths: + tril_padding_context::set_from_lengths(padding_lengths); + !*/ + public: + static void set(const tensor& input_tokens, long padding_token); + /*! + ensures + - Computes and stores padding lengths by scanning input_tokens + - For each sample, counts leading tokens equal to padding_token + - #is_set() == true (if padding_token >= 0) + - If padding_token < 0, clears the context instead + !*/ + + static void set_from_lengths(const std::vector& lengths); + /*! + ensures + - Stores the provided padding lengths directly + - #is_set() == true + - #get_padding_length(i) == lengths[i] for all valid i + !*/ + + static void set_uniform(long padding_length, long batch_size); + /*! + ensures + - Sets uniform padding length for all samples + - #is_set() == true + - #get_padding_length(i) == padding_length for i in [0, batch_size) + !*/ + + static void clear(); + /*! + ensures + - #is_set() == false + - Releases stored padding lengths + !*/ + + static long get_padding_length(long sample_idx); + /*! + ensures + - If is_set() and sample_idx is valid: returns padding length for that sample + - Otherwise: returns 0 + !*/ + + static std::vector get_all_lengths(); + /*! 
+ ensures + - Returns a copy of all stored padding lengths + - Returns empty vector if !is_set() + !*/ + + static bool is_set(); + /*! + ensures + - Returns true if padding context has been initialized + !*/ + }; + // ---------------------------------------------------------------------------------------- struct neg_infinity_tag {}; @@ -4665,6 +4740,25 @@ namespace dlib - Returns the parameters of this layer. !*/ + void set_prefix_size(long n_prefix_size); + /*! + ensures + - #get_prefix_size() == n_prefix_size + - Invalidates cached mask if value changed + !*/ + long get_prefix_size() const; + + void set_padding_token(long token_id); + /*! + ensures + - #get_padding_token() == token_id + - If token_id >= 0: enables automatic padding context usage + - If token_id < 0: disables padding masking + !*/ + long get_padding_token() const; + + bool uses_padding_context() const; + friend void serialize(const tril_& item, std::ostream& out); /*! ensures @@ -4818,6 +4912,343 @@ namespace dlib template using act16 = add_layer, SUBNET>; +// ---------------------------------------------------------------------------------------- + + class rotary_positional_embedding_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements a rotary positional embedding (RoPE) layer for neural + networks, as described in "RoFormer: Enhanced Transformer with Rotary Position + Embedding" by Su et al. + + Rotary positional embeddings encode positional information by rotating pairs + of feature dimensions according to their position in the sequence. This method + provides better relative position encoding compared to traditional learned + positional embeddings, particularly for sequence-to-sequence tasks. + + The transformation is applied as a rotation matrix in 2D subspaces: + For each pair of dimensions (i, i+1) at position pos: + [x'_i ] [cos(θ) -sin(θ)] [x_i ] + [x'_i+1] = [sin(θ) cos(θ)] [x_i+1] + + where θ(pos, i) = pos * base^(-2i/d_head) and base is typically 10000. 
+ + DYNAMIC SEQUENCE LENGTH SUPPORT: + This layer automatically adapts to different sequence lengths during + inference. When a sequence of different length is processed, the rotation + angles are recomputed on-the-fly. This allows models trained on shorter + sequences to handle longer contexts at inference time. + + YARN EXTENSION (OPTIONAL): + Optionally supports YaRN (Yet another RoPE extensioN) scaling for + improved extrapolation to longer sequences than seen during training. + YaRN applies frequency-dependent scaling that preserves low-frequency + information while adapting high-frequency components. Enable via + set_yarn_params(). + + This layer has no trainable parameters. All rotation angles are precomputed + during setup based on the sequence length and head dimension. + !*/ + + public: + + rotary_positional_embedding_( + ); + /*! + ensures + - #get_theta_base() == 10000.0 + - #get_seq_len() == 0 + - #get_d_head() == 0 + !*/ + + rotary_positional_embedding_( + const rotary_positional_embedding_& item + ); + /*! + ensures + - Creates a copy of item + - #get_theta_base() == item.get_theta_base() + - #get_seq_len() == item.get_seq_len() + - #get_d_head() == item.get_d_head() + - All precomputed trigonometric caches are copied + !*/ + + rotary_positional_embedding_& operator=( + const rotary_positional_embedding_& item + ); + /*! + ensures + - Assigns item to *this + - returns #*this + !*/ + + void set_theta_base( + float base + ); + /*! + requires + - base > 0 + ensures + - #get_theta_base() == base + - Sets the base frequency for computing rotation angles + - Higher values result in slower rotation with increasing position + - Common values: 10000 (default), 500000 (for longer sequences) + - This should be called before setup() to take effect + !*/ + + float get_theta_base( + ) const; + /*! + ensures + - Returns the base frequency used for rotation angle computation + !*/ + + long get_seq_len( + ) const; + /*! 
+ ensures + - Returns the most recent sequence length processed by this layer + - Returns 0 if forward() has not been called yet + - Note: this value may change between forward() calls if sequences + of different lengths are processed + !*/ + + long get_d_head( + ) const; + /*! + ensures + - Returns the head dimension that this layer was configured for + - Returns 0 if forward() has not been called yet + - This value remains constant once set (determined by network architecture) + !*/ + + void set_yarn_params( + float alpha, + float beta, + long original_len = 0, + bool enabled = true + ); + /*! + requires + - alpha >= 0 + - beta >= 0 + ensures + - Configures YaRN (Yet another RoPE extensioN) scaling parameters + - alpha controls the overall intensity of scaling (typical: 1.0) + - beta controls the curvature of scaling across frequency dimensions (typical: 0.25 to 0.5) + - original_len is the sequence length used during training + If 0, it will be set to the first sequence length observed in forward() + - enabled determines whether YaRN scaling is active + - YaRN allows better extrapolation to sequence lengths longer than training + - Should be called before forward() to take effect + !*/ + + const yarn_config& get_yarn_config( + ) const; + /*! + ensures + - Returns the current YaRN configuration + !*/ + + template + void setup( + const SUBNET& sub + ); + /*! 
+ requires + - sub.get_output().nr() > 0 + - sub.get_output().nc() >= 2 + ensures + - Initializes this layer based on the input dimensions + - #get_seq_len() == sub.get_output().nr() + - #get_d_head() == sub.get_output().nc() + - Precomputes and caches all cosine and sine values for the rotation + angles based on the sequence length and head dimension + - The cos_cache and sin_cache tensors are allocated with shape: + (1, 1, seq_len, d_head/2) + - If d_head is odd, only (d_head-1) dimensions will be rotated + - If YaRN is enabled and original_len is 0, the observed sequence + length is recorded as the training length for YaRN scaling + !*/ + + template + void forward( + const SUBNET& sub, + resizable_tensor& output + ); + /*! + requires + - sub.get_output().nc() >= 2 + - sub.get_output().nr() > 0 + ensures + - Applies rotary positional embeddings to the input + - #output has the same dimensions as sub.get_output() + - If the input sequence length differs from get_seq_len(), or if + this is the first forward pass after deserialization, the rotation + angles are automatically recomputed for the current sequence length. + - For each position pos and dimension pair (i, i+1): + output[pos,i] = input[pos,i] * cos(θ_pos,i/2) - input[pos,i+1] * sin(θ_pos,i/2) + output[pos,i+1] = input[pos,i] * sin(θ_pos,i/2) + input[pos,i+1] * cos(θ_pos,i/2) + - The rotation preserves the magnitude of feature vectors while encoding + relative positional information + - If d_head is odd, the last dimension is copied without rotation + - Expected input shape: (batch_size, num_heads, seq_len, d_head) + - YaRN scaling is applied if enabled via set_yarn_params() + !*/ + + template + void backward( + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ); + /*! 
+ requires + - setup() has been called + - gradient_input has the same dimensions as the output from forward() + ensures + - Computes gradients with respect to the input + - Applies the inverse rotation to gradient_input + - The inverse rotation is: + grad_input[pos,i] = grad_out[pos,i] * cos(θ) + grad_out[pos,i+1] * sin(θ) + grad_input[pos,i+1] = -grad_out[pos,i] * sin(θ) + grad_out[pos,i+1] * cos(θ) + - Accumulated gradients are added to sub.get_gradient_input() + - params_grad is not used (this layer has no trainable parameters) + !*/ + + const tensor& get_layer_params() const; + tensor& get_layer_params(); + inline dpoint map_input_to_output(const dpoint& p) const; + inline dpoint map_output_to_input(const dpoint& p) const; + + friend void serialize(const rotary_positional_embedding_& item, std::ostream& out); + friend void deserialize(rotary_positional_embedding_& item, std::istream& in); + friend std::ostream& operator<<(std::ostream& out, const rotary_positional_embedding_& item); + friend void to_xml(const rotary_positional_embedding_& item, std::ostream& out); + /*! + provides serialization support and output operators + !*/ + + }; + + template + using rope = add_layer; + +// ---------------------------------------------------------------------------------------- + + template < + long patch_size, + long embedding_dim, + long use_class_token, + long use_position_embeddings + > + class patch_embeddings_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This layer implements patch embeddings for Vision Transformers (ViT), as described + in "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" + (Dosovitskiy et al., 2021). + + The layer performs the following operations: + 1. Convolves the input image with filters of size (patch_size x patch_size) + and stride (patch_size) to create a set of projected patches + 2. Reshapes the resulting spatial feature maps into a sequence of vectors + 3. 
If use_class_token == 1, prepends a learnable 'class token' to the sequence + 4. If use_position_embeddings == 1, adds learnable position embeddings to + the entire sequence + + The input to this layer is a 4D tensor of shape: + (batch_size, in_channels, height, width) + + The output is a 4D tensor representing a sequence: + (batch_size, 1, sequence_length, embedding_dim) + where sequence_length is (height/patch_size * width/patch_size) + use_class_token + + TEMPLATE PARAMETERS + - patch_size: the side length of the square patches (e.g., 16) + - embedding_dim: the dimensionality of the resulting embeddings (e.g., 768) + - use_class_token: set to 1 to prepend a learnable CLS token, 0 otherwise + - use_position_embeddings: set to 1 to add learnable absolute position + embeddings to the sequence, 0 otherwise + !*/ + + public: + + patch_embeddings_( + ); + /*! + ensures + - #get_patch_size() == patch_size + - #get_embedding_dim() == embedding_dim + - #uses_class_token() == use_class_token + - #uses_position_embeddings() == use_position_embeddings + - #get_learning_rate_multiplier() == 1 + !*/ + + long get_patch_size() const; + long get_embedding_dim() const; + long uses_class_token() const; + long uses_position_embeddings() const; + + double get_learning_rate_multiplier() const; + void set_learning_rate_multiplier(double val); + /*! + ensures + - #get_learning_rate_multiplier() == val + !*/ + + template + void setup( + const SUBNET& sub + ); + /*! + requires + - sub.get_output().nr() % patch_size == 0 + - sub.get_output().nc() % patch_size == 0 + ensures + - Initialized the learned parameters: + - projection filters: (embedding_dim, in_channels, patch_size, patch_size) + - projection biases: (embedding_dim) + - (optional) class token and position embeddings. + - Parameters are initialized using Xavier/Glorot initialization for filters + and zero/truncated normal for other components. 
+ !*/ + + template + void forward( + const SUBNET& sub, + resizable_tensor& output + ); + /*! + requires + - setup(sub) has been called. + ensures + - #output.num_samples() == sub.get_output().num_samples() + - #output.k() == 1 + - #output.nr() == (sub.get_output().nr()/patch_size * sub.get_output().nc()/patch_size) + use_class_token + - #output.nc() == embedding_dim + !*/ + + template + void backward( + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ); + /*! + requires + - gradient_input has the same dimensions as the output of forward() + ensures + - Computes the gradient of the loss with respect to the input of this + layer and adds it to #sub.get_gradient_input() + !*/ + }; + + template + using patch_embeddings = add_layer, SUBNET>; + // ---------------------------------------------------------------------------------------- } diff --git a/dlib/dnn/loss.h b/dlib/dnn/loss.h index 36b37a2956..823f2c2352 100644 --- a/dlib/dnn/loss.h +++ b/dlib/dnn/loss.h @@ -911,6 +911,124 @@ namespace dlib using loss_multibinary_log = add_loss_layer; // ---------------------------------------------------------------------------------------- + + class loss_cross_entropy_per_logit_ + { + public: + typedef unsigned long training_label_type; + typedef unsigned long output_label_type; + + loss_cross_entropy_per_logit_() : ignore_index_(-1) {} + + void set_ignore_index(long idx) { ignore_index_ = idx; } + long get_ignore_index() const { return ignore_index_; } + + template + void to_label( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const long batch_size = output_tensor.num_samples(); + const long seq_len = output_tensor.nr(); + const long vocab_size = output_tensor.nc(); + + // Note that 
output_tensor.nc() should match the vocabulary size + const float* out_data = output_tensor.host(); + + for (long i = 0; i < batch_size; ++i, ++iter) + { + // For each sample, find the class with the maximum logit at the last + // position of the sequence (position seq_len-1). This is the predicted + // next token for autoregressive generation + long max_idx = 0; + float max_val = out_data[tensor_index(output_tensor, i, 0, seq_len - 1, 0)]; + for (long c = 1; c < vocab_size; ++c) + { + const float val = out_data[tensor_index(output_tensor, i, 0, seq_len - 1, c)]; + if (val > max_val) + { + max_val = val; + max_idx = c; + } + } + *iter = static_cast(max_idx); + } + } + + template + double compute_loss_value_and_gradient( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == grad.nr() && + output_tensor.nc() == grad.nc() && + output_tensor.k() == grad.k()); + + double loss = 0.0; +#ifdef DLIB_USE_CUDA + cuda_compute(truth, input_tensor, output_tensor, grad, loss, ignore_index_); +#else + cpu_compute(truth, input_tensor, output_tensor, grad, loss, ignore_index_); +#endif + return loss; + } + + friend void serialize(const loss_cross_entropy_per_logit_& item, std::ostream& out) + { + serialize("loss_cross_entropy_per_logit_", out); + serialize(item.ignore_index_, out); + } + + friend void deserialize(loss_cross_entropy_per_logit_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_cross_entropy_per_logit_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_cross_entropy_per_logit_."); 
+ deserialize(item.ignore_index_, in); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_cross_entropy_per_logit_& item) + { + out << "loss_cross_entropy_per_logit"; + out << " (ignore_index=" << item.ignore_index_ << ")"; + return out; + } + + friend void to_xml(const loss_cross_entropy_per_logit_& item, std::ostream& out) + { + out << "\n"; + } + + private: + long ignore_index_; + +#ifdef DLIB_USE_CUDA + cuda::compute_loss_cross_entropy_per_logit cuda_compute; +#else + cpu::compute_loss_cross_entropy_per_logit cpu_compute; +#endif + }; + + template + using loss_cross_entropy_per_logit = add_loss_layer; + // ---------------------------------------------------------------------------------------- enum class use_image_pyramid : uint8_t diff --git a/dlib/dnn/loss_abstract.h b/dlib/dnn/loss_abstract.h index 9ddfb6a4a2..54d7413e55 100644 --- a/dlib/dnn/loss_abstract.h +++ b/dlib/dnn/loss_abstract.h @@ -810,6 +810,134 @@ namespace dlib using loss_multibinary_log = add_loss_layer; // ---------------------------------------------------------------------------------------- + + class loss_cross_entropy_per_logit_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This loss layer implements cross-entropy loss for next token prediction + in transformer-based language models. Unlike loss_multiclass_log_ which + requires the output to be flattened through an fc layer, this loss function + is designed to work directly with sequence outputs from linear layers. + + This loss expects the network to produce an output tensor with these dimensions: + - output_tensor.num_samples() == batch size + - output_tensor.k() == 1 (always) + - output_tensor.nr() == sequence length + - output_tensor.nc() == vocabulary size (number of classes) + + The key feature of this loss is that it computes the cross-entropy loss + only on the LAST position of each sequence (position nr()-1), which is + the standard approach for autoregressive next token prediction. 
+ + TYPICAL NETWORK ARCHITECTURE: + using net_type = loss_cross_entropy_per_logit + linear> + > + > + > + >; + + TRAINING LABELS: + - Label type: unsigned long (scalar value per sample) + - Each label represents the target token ID: 0 <= label < vocab_size + - One label per sequence (predicting the token after the last position) + + LOSS COMPUTATION: + For each sample i in the batch: + 1. Extract logits at position [i, 0, seq_len-1, :] + 2. Compute softmax: probs = softmax(logits) + 3. Compute loss: loss += -log(probs[target_label]) + + Final loss = sum(all_losses) / batch_size + !*/ + + public: + typedef unsigned long training_label_type; + typedef unsigned long output_label_type; + + template + void to_label( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + requires + - SUBNET implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface + - sub.get_output().k() == 1 + - sub.sample_expansion_factor() == 1 + ensures + - Converts the output of the subnetwork into predicted labels. + - For each sample in the batch, extracts the logits at the last + sequence position (nr()-1) and assigns the index of the maximum + logit as the predicted label. + - Interprets the output tensor as: + output[i, 0, nr()-1, c] = logit for class c in sample i + !*/ + + template + double compute_loss_value_and_gradient( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! 
+ requires + - SUBNET implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface + - sub.sample_expansion_factor() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - The output tensor has shape [batch_size, 1, seq_len, vocab_size] + - truth == an iterator pointing to the first label in a sequence + of input_tensor.num_samples() labels + - All values pointed to by truth are < sub.get_output().nc() + (i.e., valid token IDs within vocabulary) + ensures + - Computes the cross-entropy loss for next token prediction. + - For each sample, the loss is computed only at the last sequence + position (nr()-1) using the corresponding label from truth. + - The loss is averaged over all samples in the batch. + - this function returns the loss value. + - Computes gradients with respect to the output logits and stores + them in sub.get_gradient_input(). + - Gradients are non-zero only at the last position of each sequence. + - The gradient computation uses numerically stable softmax. + !*/ + + friend void serialize(const loss_cross_entropy_per_logit_& item, std::ostream& out); + friend void deserialize(loss_cross_entropy_per_logit_& item, std::istream& in); + /*! + provides serialization support for loss_cross_entropy_per_logit_ + !*/ + + friend std::ostream& operator<<(std::ostream& out, const loss_cross_entropy_per_logit_& item); + /*! + prints a human readable string describing the loss layer to the output stream + !*/ + + friend void to_xml(const loss_cross_entropy_per_logit_& item, std::ostream& out); + /*! + provides XML serialization support for loss_cross_entropy_per_logit_ + !*/ + }; + + template + using loss_cross_entropy_per_logit = add_loss_layer; + /*! + This adds the loss_cross_entropy_per_logit_ loss layer onto SUBNET. + + TYPICAL USAGE IN TRANSFORMER NETWORKS: + This loss layer is specifically designed for transformer-based language + models that use autoregressive next token prediction. 
It should be used + as the final layer of a network that outputs logits for each position + in a sequence. + !*/ + // ---------------------------------------------------------------------------------------- enum class use_image_pyramid : uint8_t diff --git a/dlib/dnn/lr_scheduler.h b/dlib/dnn/lr_scheduler.h new file mode 100644 index 0000000000..0ca8444c36 --- /dev/null +++ b/dlib/dnn/lr_scheduler.h @@ -0,0 +1,385 @@ +// Copyright (C) 2025 Cydral (cydraltechnology@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_LR_SCHEDULER_H_ +#define DLIB_DNN_LR_SCHEDULER_H_ + +#include "lr_scheduler_abstract.h" +#include "../serialize.h" +#include +#include +#include + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + constexpr double lr_scheduler_pi = 3.14159265358979323846; + } + +// ---------------------------------------------------------------------------------------- + + enum class lr_decay_type + { + COSINE, + LINEAR, + CONSTANT, + EXPONENTIAL + }; + +// ---------------------------------------------------------------------------------------- + + class lr_scheduler + { + public: + + lr_scheduler( + ) : + current_step_(0), + warmup_steps_(2000), + hold_steps_(0), + total_steps_(100000), + initial_lr_(1e-7), + peak_lr_(3e-4), + min_lr_(1e-6), + decay_type_(lr_decay_type::COSINE) + { + compute_decay_steps(); + } + + lr_scheduler( + double peak_lr, + size_t warmup_steps, + size_t total_steps, + double min_lr = 1e-6, + lr_decay_type decay_type = lr_decay_type::COSINE + ) : + current_step_(0), + warmup_steps_(warmup_steps), + hold_steps_(0), + total_steps_(total_steps), + initial_lr_(min_lr), + peak_lr_(peak_lr), + min_lr_(min_lr), + decay_type_(decay_type) + { + DLIB_CASSERT(peak_lr > 0, "peak_lr must be positive"); + DLIB_CASSERT(min_lr >= 0, "min_lr must be non-negative"); + DLIB_CASSERT(min_lr < peak_lr, "min_lr must be less than 
peak_lr"); + DLIB_CASSERT(warmup_steps < total_steps, "warmup_steps must be less than total_steps"); + compute_decay_steps(); + } + + double get_learning_rate( + ) const + { + // Phase 1: Warmup + if (current_step_ < warmup_steps_) + { + if (warmup_steps_ == 0) + return peak_lr_; + const double progress = static_cast(current_step_) / warmup_steps_; + return initial_lr_ + (peak_lr_ - initial_lr_) * progress; + } + + // Phase 2: Hold (optional) + const size_t post_warmup = current_step_ - warmup_steps_; + if (post_warmup < hold_steps_) + return peak_lr_; + + // Phase 3: Decay + if (decay_steps_ == 0) + return peak_lr_; + + const size_t decay_step = post_warmup - hold_steps_; + const double progress = std::min(1.0, static_cast(decay_step) / decay_steps_); + + switch (decay_type_) + { + case lr_decay_type::COSINE: + return min_lr_ + 0.5 * (peak_lr_ - min_lr_) * (1.0 + std::cos(impl::lr_scheduler_pi * progress)); + + case lr_decay_type::LINEAR: + return peak_lr_ - (peak_lr_ - min_lr_) * progress; + + case lr_decay_type::EXPONENTIAL: + return peak_lr_ * std::pow(min_lr_ / peak_lr_, progress); + + case lr_decay_type::CONSTANT: + default: + return peak_lr_; + } + } + + double get_learning_rate( + size_t step + ) const + { + lr_scheduler temp = *this; + temp.current_step_ = step; + return temp.get_learning_rate(); + } + + void step( + size_t n = 1 + ) + { + current_step_ += n; + } + + void reset( + ) + { + current_step_ = 0; + } + + void set_current_step( + size_t step + ) + { + current_step_ = step; + } + + size_t get_current_step( + ) const { return current_step_; } + + size_t get_warmup_steps( + ) const { return warmup_steps_; } + + size_t get_hold_steps( + ) const { return hold_steps_; } + + size_t get_total_steps( + ) const { return total_steps_; } + + size_t get_decay_steps( + ) const { return decay_steps_; } + + double get_initial_lr( + ) const { return initial_lr_; } + + double get_peak_lr( + ) const { return peak_lr_; } + + double get_min_lr( + ) const { return 
min_lr_; } + + lr_decay_type get_decay_type( + ) const { return decay_type_; } + + void set_peak_lr( + double lr + ) + { + DLIB_CASSERT(lr > 0 && lr > min_lr_); + peak_lr_ = lr; + } + + void set_min_lr( + double lr + ) + { + DLIB_CASSERT(lr >= 0 && lr < peak_lr_); + min_lr_ = lr; + } + + void set_initial_lr( + double lr + ) + { + DLIB_CASSERT(lr >= 0 && lr <= peak_lr_); + initial_lr_ = lr; + } + + void set_warmup_steps( + size_t steps + ) + { + DLIB_CASSERT(steps < total_steps_); + warmup_steps_ = steps; + compute_decay_steps(); + } + + void set_hold_steps( + size_t steps + ) + { + hold_steps_ = steps; + compute_decay_steps(); + } + + void set_total_steps( + size_t steps + ) + { + DLIB_CASSERT(steps > warmup_steps_); + total_steps_ = steps; + compute_decay_steps(); + } + + void set_decay_type( + lr_decay_type type + ) + { + decay_type_ = type; + } + + bool is_warmup_complete( + ) const { return current_step_ >= warmup_steps_; } + + bool is_training_complete( + ) const { return current_step_ >= total_steps_; } + + double get_warmup_progress( + ) const + { + if (warmup_steps_ == 0) + return 1.0; + return std::min(1.0, static_cast(current_step_) / warmup_steps_); + } + + double get_total_progress( + ) const + { + if (total_steps_ == 0) + return 1.0; + return std::min(1.0, static_cast(current_step_) / total_steps_); + } + + std::string get_phase_name( + ) const + { + if (current_step_ < warmup_steps_) + return "warmup"; + else if (current_step_ < warmup_steps_ + hold_steps_) + return "hold"; + else + return "decay"; + } + + private: + + void compute_decay_steps( + ) + { + const size_t non_decay = warmup_steps_ + hold_steps_; + decay_steps_ = (total_steps_ > non_decay) ? 
(total_steps_ - non_decay) : 0; + } + + size_t current_step_; + size_t warmup_steps_; + size_t hold_steps_; + size_t total_steps_; + size_t decay_steps_; + double initial_lr_; + double peak_lr_; + double min_lr_; + lr_decay_type decay_type_; + }; + +// ---------------------------------------------------------------------------------------- + + inline void serialize( + const lr_scheduler& item, + std::ostream& out + ) + { + serialize("lr_scheduler", out); + serialize(item.get_current_step(), out); + serialize(item.get_warmup_steps(), out); + serialize(item.get_hold_steps(), out); + serialize(item.get_total_steps(), out); + serialize(item.get_decay_steps(), out); + serialize(item.get_initial_lr(), out); + serialize(item.get_peak_lr(), out); + serialize(item.get_min_lr(), out); + serialize(static_cast(item.get_decay_type()), out); + } + + inline void deserialize( + lr_scheduler& item, + std::istream& in + ) + { + std::string version; + deserialize(version, in); + if (version != "lr_scheduler") + throw serialization_error("Unexpected version '" + version + + "' found while deserializing lr_scheduler."); + + size_t current_step, warmup_steps, hold_steps, total_steps, decay_steps; + double initial_lr, peak_lr, min_lr; + int decay_type_int; + + deserialize(current_step, in); + deserialize(warmup_steps, in); + deserialize(hold_steps, in); + deserialize(total_steps, in); + deserialize(decay_steps, in); + deserialize(initial_lr, in); + deserialize(peak_lr, in); + deserialize(min_lr, in); + deserialize(decay_type_int, in); + + item = lr_scheduler(peak_lr, warmup_steps, total_steps, min_lr, + static_cast(decay_type_int)); + item.set_initial_lr(initial_lr); + item.set_hold_steps(hold_steps); + item.set_current_step(current_step); + } + + inline std::ostream& operator<<( + std::ostream& out, + const lr_scheduler& item + ) + { + out << "lr_scheduler (" + << "step=" << item.get_current_step() + << ", lr=" << item.get_learning_rate() + << ", phase=" << item.get_phase_name() + << ", 
warmup=" << item.get_warmup_steps() + << ", total=" << item.get_total_steps() + << ", peak=" << item.get_peak_lr() + << ", min=" << item.get_min_lr() + << ")"; + return out; + } + +// ---------------------------------------------------------------------------------------- + + inline lr_scheduler make_transformer_scheduler( + double peak_lr, + size_t total_steps, + double warmup_fraction = 0.02, + double min_lr = 1e-6, + lr_decay_type decay_type = lr_decay_type::COSINE + ) + { + DLIB_CASSERT(peak_lr > 0, "peak_lr must be positive"); + DLIB_CASSERT(total_steps > 0, "total_steps must be positive"); + DLIB_CASSERT(warmup_fraction > 0 && warmup_fraction < 1, "warmup_fraction must be in (0, 1)"); + DLIB_CASSERT(min_lr >= 0 && min_lr < peak_lr, "min_lr must be in [0, peak_lr)"); + + size_t warmup_steps = static_cast(total_steps * warmup_fraction); + warmup_steps = std::max(size_t(100), warmup_steps); + return lr_scheduler(peak_lr, warmup_steps, total_steps, min_lr, decay_type); + } + + inline size_t estimate_total_steps( + size_t dataset_size, + size_t batch_size, + size_t num_epochs + ) + { + DLIB_CASSERT(batch_size > 0, "batch_size must be positive"); + const size_t steps_per_epoch = (dataset_size + batch_size - 1) / batch_size; + return steps_per_epoch * num_epochs; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNN_LR_SCHEDULER_H_ diff --git a/dlib/dnn/lr_scheduler_abstract.h b/dlib/dnn/lr_scheduler_abstract.h new file mode 100644 index 0000000000..f1ced39e50 --- /dev/null +++ b/dlib/dnn/lr_scheduler_abstract.h @@ -0,0 +1,481 @@ +// Copyright (C) 2025 Cydral (cydraltechnology@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. 
+#undef DLIB_DNN_LR_SCHEDULER_ABSTRACT_H_ +#ifdef DLIB_DNN_LR_SCHEDULER_ABSTRACT_H_ + +#include +#include +#include + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + enum class lr_decay_type + { + /*! + WHAT THIS ENUM REPRESENTS + This enum specifies the type of learning rate decay to use after the + warmup phase completes. The decay function determines how the learning + rate decreases from peak_lr to min_lr over the remaining training steps. + !*/ + + COSINE, + /*! + Cosine annealing decay. The learning rate follows a cosine curve: + lr = min_lr + 0.5 * (peak_lr - min_lr) * (1 + cos(pi * progress)) + + This is the recommended decay type for transformer training as it provides + smooth decay with a gradual slowdown near the end of training. + !*/ + + LINEAR, + /*! + Linear decay. The learning rate decreases linearly: + lr = peak_lr - (peak_lr - min_lr) * progress + + Simple and predictable decay suitable for general deep learning tasks. + !*/ + + CONSTANT, + /*! + No decay after warmup. The learning rate remains at peak_lr: + lr = peak_lr + + Useful when using external learning rate control or for debugging. + !*/ + + EXPONENTIAL + /*! + Exponential decay. The learning rate decreases exponentially: + lr = peak_lr * (min_lr / peak_lr)^progress + + Provides rapid initial decay that slows down over time. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + class lr_scheduler + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements a learning rate scheduler with warmup and decay + phases, designed for training transformer-based neural networks. It is + intended to be used alongside dnn_trainer to provide dynamic learning + rate adjustment during training. + + The schedule consists of three phases: + 1. WARMUP: Linear increase from initial_lr to peak_lr + 2. HOLD (optional): Maintain peak_lr for hold_steps + 3. 
DECAY: Decrease from peak_lr to min_lr using selected decay type + + MATHEMATICAL FORMULATION + Warmup phase (step < warmup_steps): + lr = initial_lr + (peak_lr - initial_lr) * (step / warmup_steps) + + Hold phase (warmup_steps <= step < warmup_steps + hold_steps): + lr = peak_lr + + Decay phase (step >= warmup_steps + hold_steps): + progress = (step - warmup_steps - hold_steps) / decay_steps + + For COSINE: + lr = min_lr + 0.5 * (peak_lr - min_lr) * (1 + cos(pi * progress)) + + For LINEAR: + lr = peak_lr - (peak_lr - min_lr) * progress + + For EXPONENTIAL: + lr = peak_lr * (min_lr / peak_lr)^progress + + For CONSTANT: + lr = peak_lr + + THREAD SAFETY + This object is not thread-safe. Each trainer should have its own scheduler + instance. If using multiple trainers in parallel, each should maintain its + own lr_scheduler. + + SERIALIZATION + This object supports serialization through serialize() and deserialize() + functions, allowing training to be checkpointed and resumed. + + TYPICAL USAGE + // Create scheduler + lr_scheduler scheduler( + 3e-4, // peak_lr + 2000, // warmup_steps + 100000, // total_steps + 1e-6, // min_lr + lr_decay_type::COSINE + ); + + // Training loop + while (!scheduler.is_training_complete()) { + trainer.set_learning_rate(scheduler.get_learning_rate()); + trainer.train_one_step(data, labels); + scheduler.step(); + } + !*/ + + public: + + lr_scheduler( + ); + /*! + ensures + - Constructs a default scheduler with reasonable defaults for transformer training + - #get_peak_lr() == 3e-4 + - #get_min_lr() == 1e-6 + - #get_initial_lr() == 1e-7 + - #get_warmup_steps() == 2000 + - #get_hold_steps() == 0 + - #get_total_steps() == 100000 + - #get_decay_type() == lr_decay_type::COSINE + - #get_current_step() == 0 + !*/ + + lr_scheduler( + double peak_lr, + size_t warmup_steps, + size_t total_steps, + double min_lr = 1e-6, + lr_decay_type decay_type = lr_decay_type::COSINE + ); + /*! 
+ requires + - peak_lr > 0 + - min_lr >= 0 + - min_lr < peak_lr + - warmup_steps < total_steps + ensures + - #get_peak_lr() == peak_lr + - #get_min_lr() == min_lr + - #get_initial_lr() == min_lr + - #get_warmup_steps() == warmup_steps + - #get_hold_steps() == 0 + - #get_total_steps() == total_steps + - #get_decay_type() == decay_type + - #get_current_step() == 0 + !*/ + + double get_learning_rate( + ) const; + /*! + ensures + - Returns the learning rate for the current step based on the schedule + - The returned value is always >= get_min_lr() + - The returned value is always <= get_peak_lr() + - During warmup: returns a value linearly interpolated between + get_initial_lr() and get_peak_lr() + - During hold: returns get_peak_lr() + - During decay: returns a value determined by get_decay_type() + !*/ + + double get_learning_rate( + size_t step + ) const; + /*! + ensures + - Returns the learning rate that would be used at the specified step + - Does not modify the scheduler state + - Equivalent to temporarily setting current_step to step and calling + get_learning_rate(), then restoring the original current_step + !*/ + + void step( + size_t n = 1 + ); + /*! + ensures + - #get_current_step() == get_current_step() + n + - Advances the scheduler by n steps + !*/ + + void reset( + ); + /*! + ensures + - #get_current_step() == 0 + - Resets the scheduler to its initial state + !*/ + + void set_current_step( + size_t step + ); + /*! + ensures + - #get_current_step() == step + - Useful for resuming training from a checkpoint + !*/ + + size_t get_current_step( + ) const; + /*! + ensures + - Returns the current training step + !*/ + + size_t get_warmup_steps( + ) const; + /*! + ensures + - Returns the number of warmup steps configured for this scheduler + - During warmup, the learning rate increases linearly from + get_initial_lr() to get_peak_lr() + !*/ + + size_t get_hold_steps( + ) const; + /*! 
+ ensures + - Returns the number of hold steps configured for this scheduler + - During hold, the learning rate remains constant at get_peak_lr() + !*/ + + size_t get_total_steps( + ) const; + /*! + ensures + - Returns the total number of training steps configured for this scheduler + - Training is considered complete when get_current_step() >= get_total_steps() + !*/ + + size_t get_decay_steps( + ) const; + /*! + ensures + - Returns the number of steps in the decay phase + - Computed as: get_total_steps() - get_warmup_steps() - get_hold_steps() + !*/ + + double get_initial_lr( + ) const; + /*! + ensures + - Returns the initial learning rate at the start of warmup + - This is the learning rate used at step 0 + !*/ + + double get_peak_lr( + ) const; + /*! + ensures + - Returns the peak learning rate reached at the end of warmup + - This is the maximum learning rate during training + !*/ + + double get_min_lr( + ) const; + /*! + ensures + - Returns the minimum learning rate at the end of training + - The learning rate will never go below this value + !*/ + + lr_decay_type get_decay_type( + ) const; + /*! + ensures + - Returns the decay type used after warmup completes + !*/ + + void set_peak_lr( + double lr + ); + /*! + requires + - lr > 0 + - lr > get_min_lr() + ensures + - #get_peak_lr() == lr + !*/ + + void set_min_lr( + double lr + ); + /*! + requires + - lr >= 0 + - lr < get_peak_lr() + ensures + - #get_min_lr() == lr + !*/ + + void set_initial_lr( + double lr + ); + /*! + requires + - lr >= 0 + - lr <= get_peak_lr() + ensures + - #get_initial_lr() == lr + !*/ + + void set_warmup_steps( + size_t steps + ); + /*! + requires + - steps < get_total_steps() + ensures + - #get_warmup_steps() == steps + - #get_decay_steps() is recomputed accordingly + !*/ + + void set_hold_steps( + size_t steps + ); + /*! + ensures + - #get_hold_steps() == steps + - #get_decay_steps() is recomputed accordingly + !*/ + + void set_total_steps( + size_t steps + ); + /*! 
+ requires + - steps > get_warmup_steps() + ensures + - #get_total_steps() == steps + - #get_decay_steps() is recomputed accordingly + !*/ + + void set_decay_type( + lr_decay_type type + ); + /*! + ensures + - #get_decay_type() == type + !*/ + + bool is_warmup_complete( + ) const; + /*! + ensures + - Returns true if the warmup phase has completed + - Equivalent to: get_current_step() >= get_warmup_steps() + !*/ + + bool is_training_complete( + ) const; + /*! + ensures + - Returns true if all training steps have been completed + - Equivalent to: get_current_step() >= get_total_steps() + !*/ + + double get_warmup_progress( + ) const; + /*! + ensures + - Returns a value between 0.0 and 1.0 indicating progress through warmup + - Returns 1.0 if warmup is complete + - Computed as: min(1.0, get_current_step() / get_warmup_steps()) + !*/ + + double get_total_progress( + ) const; + /*! + ensures + - Returns a value between 0.0 and 1.0 indicating overall training progress + - Computed as: min(1.0, get_current_step() / get_total_steps()) + !*/ + + std::string get_phase_name( + ) const; + /*! + ensures + - Returns "warmup" if in the warmup phase + - Returns "hold" if in the hold phase + - Returns "decay" if in the decay phase + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + void serialize( + const lr_scheduler& item, + std::ostream& out + ); + /*! + ensures + - Serializes the complete state of item to the output stream out + - The serialized state includes: current_step, warmup_steps, hold_steps, + total_steps, decay_steps, initial_lr, peak_lr, min_lr, and decay_type + !*/ + + void deserialize( + lr_scheduler& item, + std::istream& in + ); + /*! 
+ ensures + - Deserializes the state of item from the input stream in + - Restores all configuration and progress state + throws + - serialization_error if the data in 'in' is not valid lr_scheduler data + !*/ + + std::ostream& operator<<( + std::ostream& out, + const lr_scheduler& item + ); + /*! + ensures + - Prints a human-readable summary of the scheduler state to out + - Includes: current step, current learning rate, phase name, and configuration + !*/ + +// ---------------------------------------------------------------------------------------- + + lr_scheduler make_transformer_scheduler( + double peak_lr, + size_t total_steps, + double warmup_fraction = 0.02, + double min_lr = 1e-6, + lr_decay_type decay_type = lr_decay_type::COSINE + ); + /*! + requires + - peak_lr > 0 + - total_steps > 0 + - 0 < warmup_fraction < 1 + - min_lr >= 0 + - min_lr < peak_lr + ensures + - Returns an lr_scheduler configured with common transformer training settings + - The warmup_steps is computed as: max(100, total_steps * warmup_fraction) + - returns a scheduler S such that: + - S.get_peak_lr() == peak_lr + - S.get_total_steps() == total_steps + - S.get_min_lr() == min_lr + - S.get_decay_type() == decay_type + - S.get_warmup_steps() == max(100, total_steps * warmup_fraction) + !*/ + + size_t estimate_total_steps( + size_t dataset_size, + size_t batch_size, + size_t num_epochs + ); + /*! 
+ requires + - batch_size > 0 + ensures + - Returns an estimate of the total number of training steps + - Computed as: ceil(dataset_size / batch_size) * num_epochs + - Useful for configuring lr_scheduler when you know the dataset size, + batch size, and desired number of epochs + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNN_LR_SCHEDULER_ABSTRACT_H_ diff --git a/dlib/dnn/solvers.h b/dlib/dnn/solvers.h index 6eab32be12..d28a5aa93f 100644 --- a/dlib/dnn/solvers.h +++ b/dlib/dnn/solvers.h @@ -397,6 +397,349 @@ namespace dlib float t; }; + // ---------------------------------------------------------------------------------------- + + /*! + AdamW optimizer with decoupled weight decay regularization. + + This optimizer implements the AdamW algorithm from "Decoupled Weight Decay + Regularization" (Loshchilov & Hutter, ICLR 2019). Unlike standard Adam, + AdamW decouples the weight decay from the gradient-based optimization step, + leading to better generalization and easier hyperparameter tuning. + + THEORETICAL FOUNDATION: + Standard Adam with L2 regularization computes: + theta_t = theta_{t-1} - alpha * m_hat_t / sqrt(v_hat_t + epsilon) + where gradients include the L2 regularization term + + AdamW decouples weight decay and computes: + m_t = beta1 * m_{t-1} + (1-beta1) * gradient_L + v_t = beta2 * v_{t-1} + (1-beta2) * (gradient_L)^2 + theta_t = theta_{t-1} - alpha * (m_hat_t/sqrt(v_hat_t) + lambda*theta_{t-1}) + + This formulation makes the optimal weight decay factor independent of + the learning rate, improving generalization especially for long training runs. + + IMPLEMENTATION STRATEGY: + 1. Compute standard Adam update with weight_decay = 0 (decoupled) + 2. Explicitly apply weight decay: update = update - lr * wd * params + 3. 
The update is then added to parameters by the trainer + + KEY DIFFERENCES FROM ADAM: + - Weight decay is applied directly to parameters (multiplicative) + - Weight decay does not interact with adaptive learning rates + - Better hyperparameter independence (learning rate vs weight decay) + - Superior generalization on image classification and NLP tasks + + CONSTRUCTOR PARAMETERS: + - weight_decay: Decoupled weight decay coefficient (default: 0.01) + Typical range: 0.0001 to 0.1 + Higher values = stronger regularization + - momentum1 (beta1): Exponential decay rate for first moment (default: 0.9) + Controls the momentum of gradient moving average + - momentum2 (beta2): Exponential decay rate for second moment (default: 0.999) + Controls the momentum of squared gradient moving average + + REFERENCES: + - Loshchilov & Hutter (2019). "Decoupled Weight Decay Regularization" + ICLR 2019. https://arxiv.org/abs/1711.05101 + - Kingma & Ba (2015). "Adam: A Method for Stochastic Optimization" + ICLR 2015. https://arxiv.org/abs/1412.6980 + + NOTE: AdamW is the standard optimizer for modern transformer models including + GPT, BERT, LLaMA, Mistral, Qwen, DeepSeek, and other large language models. + It consistently outperforms standard Adam with L2 regularization. 
+ !*/ + class adamw + { + public: + + explicit adamw( + float weight_decay_ = 0.01f, + float momentum1_ = 0.9f, + float momentum2_ = 0.999f + ) + { + weight_decay = weight_decay_; + momentum1 = momentum1_; + momentum2 = momentum2_; + t = 0; + } + + float get_momentum1() const { return momentum1; } + float get_momentum2() const { return momentum2; } + float get_weight_decay() const { return weight_decay; } + + template <typename layer_type> + const tensor& operator() ( + const float learning_rate, + const layer_type& l, + const tensor& params_grad + ) + { + const tensor& params = l.get_layer_params(); + DLIB_CASSERT(params.size() != 0); + + if (v.size() == 0) + { + m.copy_size(params_grad); + m = 0; + v.copy_size(params_grad); + v = 0; + s.copy_size(params_grad); + } + + ++t; + + // Step 1: compute standard Adam update with decoupled weight decay (wd = 0) + // This populates 's' with the adaptive gradient step: -alpha * m_hat_t / sqrt(v_hat_t) + // By passing weight_decay = 0, we decouple the regularization from the adaptive update + tt::compute_adam_update(0, params.size(), s, m, v, t, + learning_rate * get_learning_rate_multiplier(l), + 0, // Critical: weight_decay = 0 for decoupled regularization + momentum1, momentum2, params, params_grad); + + // Step 2: apply decoupled weight decay explicitly + // Formula: s = s - alpha * lambda * theta_{t-1} + // This implements the AdamW update: theta_t = theta_{t-1} - alpha * (m_hat_t/sqrt(v_hat_t) + lambda * theta_{t-1}) + const double lr = learning_rate * get_learning_rate_multiplier(l); + const double wd = weight_decay * get_weight_decay_multiplier(l); + + if (wd != 0) + { + // Compute: s = s + params * (-lr * wd) + tt::affine_transform(s, s, params, 1.0, -lr * wd); + } + + return s; + } + + template <unsigned long N> + const tensor& operator() ( + const float learning_rate, + const fc_<N, FC_HAS_BIAS>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size() - l.get_num_outputs()); + return s; + } + + template < 
long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const con_<_num_filters, _nr, _nc, _stride_y, _stride_x, _padding_y, _padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size() - l.num_filters()); + return s; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const cont_<_num_filters, _nr, _nc, _stride_y, _stride_x, _padding_y, _padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size() - l.num_filters()); + return s; + } + + template < layer_mode mode > + const tensor& operator() ( + const float learning_rate, + const bn_<mode>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size() / 2); + return s; + } + + friend void serialize(const adamw& item, std::ostream& out) + { + serialize("adamw", out); + serialize(item.m, out); + serialize(item.v, out); + serialize(item.s, out); + serialize(item.weight_decay, out); + serialize(item.momentum1, out); + serialize(item.momentum2, out); + serialize(item.t, out); + } + + friend void deserialize(adamw& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "adamw") + throw serialization_error("Unexpected version found while deserializing dlib::adamw."); + deserialize(item.m, in); + deserialize(item.v, in); + deserialize(item.s, in); + deserialize(item.weight_decay, in); + deserialize(item.momentum1, in); + deserialize(item.momentum2, in); + deserialize(item.t, in); + } + + friend std::ostream& operator<< (std::ostream& out, const adamw& item) + { + out << "adamw: weight_decay=" << item.get_weight_decay() + << ", momentum1=" << 
item.get_momentum1() + << ", momentum2=" << item.get_momentum2(); + return out; + } + + private: + + /*! + Updates parameters that may have different learning rate and weight decay + multipliers for weights vs biases (e.g., fully connected and convolutional layers). + + BIAS HANDLING: + Most layers separate weights and biases: + - Weights: indices [0, bias_offset) + - Biases: indices [bias_offset, end) + + Different multipliers may apply to each section: + - bias_learning_rate_multiplier (typically 1.0 or 2.0) + - bias_weight_decay_multiplier (typically 0.0 - no decay on biases) + + PARAMETERS: + - learning_rate: base learning rate from trainer + - l: layer containing parameters and multiplier settings + - params_grad: gradient tensor + - bias_offset: index where biases start in the parameter tensor + !*/ + template <typename layer_type> + void update_considering_bias( + const float learning_rate, + const layer_type& l, + const tensor& params_grad, + unsigned long bias_offset + ) + { + const tensor& params = l.get_layer_params(); + DLIB_CASSERT(params.size() != 0); + + if (v.size() == 0) + { + m.copy_size(params_grad); + m = 0; + v.copy_size(params_grad); + v = 0; + s.copy_size(params_grad); + } + + ++t; + + // Step 1: compute adaptive gradient update with decoupled weight decay + if (l.get_bias_learning_rate_multiplier() == 1) + { + // Simple case: uniform learning rate for all parameters + tt::compute_adam_update(0, params.size(), s, m, v, t, + learning_rate * get_learning_rate_multiplier(l), + 0, // Decoupled: weight_decay = 0 in Adam computation + momentum1, momentum2, params, params_grad); + } + else + { + // Complex case: different learning rates for weights and biases + + // Process weights: indices [0, bias_offset) + tt::compute_adam_update(0, bias_offset, s, m, v, t, + learning_rate * get_learning_rate_multiplier(l), + 0, // Decoupled weight decay + momentum1, momentum2, params, params_grad); + + // Process biases: indices [bias_offset, end) + // Apply bias learning rate 
multiplier + tt::compute_adam_update(bias_offset, params.size(), s, m, v, t, + learning_rate * get_learning_rate_multiplier(l) * l.get_bias_learning_rate_multiplier(), + 0, // Decoupled weight decay + momentum1, momentum2, params, params_grad); + } + + // Step 2: apply decoupled weight decay + // Formula: s = s - lr * wd * params + // This is applied separately to weights and biases because they may have + // different weight decay multipliers + double lr = learning_rate * get_learning_rate_multiplier(l); + double wd = weight_decay * get_weight_decay_multiplier(l); + + if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1) + { + // Simple case: uniform weight decay for all parameters + if (wd != 0) + tt::affine_transform(s, s, params, 1.0, -lr * wd); + } + else + { + // Complex case: different weight decay for weights vs biases + + // Apply weight decay to weights: indices [0, bias_offset) + // Computation: s[i] = 1.0 * s[i] + (-lr * wd) * params[i] + 0.0 * params[i] + // The third source (params) is not used since C = 0.0 + if (wd != 0) + { + tt::affine_transform_range(0, bias_offset, + s, // dest + s, // src1 (A coefficient) + params, // src2 (B coefficient) + params, // src3 (C coefficient = 0, so this is unused) + 1.0, // A: keep current update + -lr * wd, // B: subtract weight decay term + 0.0); // C: ignore third source + } + + // Apply weight decay to biases: indices [bias_offset, end) + // Note: typically bias_weight_decay_multiplier = 0 (no regularization on biases) + // This is a common practice in deep learning to prevent biases from becoming too small + lr *= l.get_bias_learning_rate_multiplier(); + wd *= l.get_bias_weight_decay_multiplier(); + + if (wd != 0) + { + tt::affine_transform_range(bias_offset, v.size(), + s, + s, + params, + params, + 1.0, + -lr * wd, + 0.0); + } + } + } + + resizable_tensor m; // First moment estimate (exponential moving average of gradients) + resizable_tensor v; // Second moment 
estimate (exponential moving average of squared gradients) + resizable_tensor s; // Parameter update computed by the optimizer + float weight_decay; // Weight decay coefficient (lambda in the paper) + float momentum1; // Beta1: decay rate for first moment + float momentum2; // Beta2: decay rate for second moment + float t; // Time step counter for bias correction + }; + // ---------------------------------------------------------------------------------------- } diff --git a/dlib/dnn/solvers_abstract.h b/dlib/dnn/solvers_abstract.h index 7a07452170..20c37987dd 100644 --- a/dlib/dnn/solvers_abstract.h +++ b/dlib/dnn/solvers_abstract.h @@ -9,8 +9,6 @@ namespace dlib { -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- class EXAMPLE_SOLVER @@ -69,8 +67,6 @@ namespace dlib Prints the solver's name and parameters to out. !*/ -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- class sgd @@ -196,6 +192,82 @@ namespace dlib Prints the solver's name and parameters to out. !*/ +// ---------------------------------------------------------------------------------------- + + class adamw + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the EXAMPLE_SOLVER interface defined above. In + particular, it implements the AdamW parameter update method with decoupled + weight decay regularization as described in the paper: + Loshchilov, Ilya, and Frank Hutter. "Decoupled weight decay + regularization." International Conference on Learning Representations. 2019. 
+ + The key difference from standard Adam is that weight decay is decoupled from + the gradient-based optimization step. This leads to better generalization + performance and makes the optimal weight decay factor more independent of the + learning rate setting. AdamW has become the standard optimizer for training + large language models and transformer architectures. + + The update is computed as: + m_t = momentum1*m_{t-1} + (1-momentum1)*params_grad + v_t = momentum2*v_{t-1} + (1-momentum2)*(params_grad^2) + V = -learning_rate * (m_hat_t/sqrt(v_hat_t) + weight_decay*l.get_layer_params()) + where m_hat_t and v_hat_t are bias-corrected moment estimates. + + Note that the actual learning rate and weight decay used by the solver are + multiplied by the per layer multipliers. That is, the solver will call + get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and + multiply these values with the nominal learning rate and weight decay, + respectively, to determine the values it will use during each step. It is + also overloaded to allow additional learning rate multipliers to be applied + to fc_ and con_ bias parameters. + !*/ + + public: + + adamw( + ); + /*! + ensures + - #get_weight_decay() == 0.01 + - #get_momentum1() == 0.9 + - #get_momentum2() == 0.999 + !*/ + + explicit adamw( + float weight_decay, + float momentum1 = 0.9, + float momentum2 = 0.999 + ); + /*! + requires + - weight_decay >= 0 + - 0 <= momentum1 < 1 + - 0 <= momentum2 < 1 + ensures + - #get_weight_decay() == weight_decay + - #get_momentum1() == momentum1 + - #get_momentum2() == momentum2 + !*/ + + float get_weight_decay() const; + float get_momentum1() const; + float get_momentum2() const; + }; + + void serialize(const adamw& item, std::ostream& out); + void deserialize(adamw& item, std::istream& in); + /*! + provides serialization support + !*/ + + std::ostream& operator<< (std::ostream& out, const adamw& item); + /*! + Prints the solver's name and parameters to out. 
+ !*/ + // ---------------------------------------------------------------------------------------- } diff --git a/dlib/dnn/trainer.h b/dlib/dnn/trainer.h index c329791e78..3cdc6fa1ec 100644 --- a/dlib/dnn/trainer.h +++ b/dlib/dnn/trainer.h @@ -11,6 +11,7 @@ #include #include #include "../serialize.h" +#include "lr_scheduler.h" #include "../pipe.h" #include "../threads.h" diff --git a/dlib/dnn/transformer.h b/dlib/dnn/transformer.h new file mode 100644 index 0000000000..786e8ea8a0 --- /dev/null +++ b/dlib/dnn/transformer.h @@ -0,0 +1,1019 @@ +// Copyright (C) 2025 Cydral Technology (cydraltechnology@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_TRANSFORMER_H_ +#define DLIB_DNN_TRANSFORMER_H_ + +#include "transformer_abstract.h" +#include "layers.h" + +namespace dlib +{ + // ---------------------------------------------------------------------------------------- + + template + class scale_weights_ : public multiply_ + { + public: + explicit scale_weights_() : multiply_(1.0f / std::sqrt(static_cast(d_k_))) {} + }; + + template + using scale_weights = add_layer, SUBNET>; + + // ---------------------------------------------------------------------------------------- + + template + using positional_embeddings = positional_encodings< + embeddings>; + + // ---------------------------------------------------------------------------------------- + + // CANONICAL TRANSFORMER ARCHITECTURE + namespace canonical_transformer + { + + template + using query = reshape_to>; + + template + using key = reshape_to>; + + template + using value = reshape_to>; + + template