From 3e9b9f14290e034e6ddcf76c71b4f4182b377520 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 28 Apr 2025 22:10:06 +0200 Subject: [PATCH 01/21] Implementation of linear_ layer for neural networks. This layer provides an optimized linear transformation for multi-dimensional inputs. --- dlib/dnn/layers.h | 200 ++++++++++++++++++++++++++++ dlib/dnn/layers_abstract.h | 260 +++++++++++++++++++++++++++++++++++++ dlib/test/dnn.cpp | 75 +++++++++++ 3 files changed, 535 insertions(+) diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h index 0a0c547f33..445864b4de 100644 --- a/dlib/dnn/layers.h +++ b/dlib/dnn/layers.h @@ -2143,6 +2143,206 @@ namespace dlib > using fc_no_bias = add_layer, SUBNET>; +// ---------------------------------------------------------------------------------------- + + enum linear_bias_mode { LINEAR_HAS_BIAS = 0, LINEAR_NO_BIAS = 1 }; + + template < + unsigned long num_outputs_, + linear_bias_mode bias_mode_ + > + class linear_ + { + static_assert(num_outputs_ > 0, "The number of outputs from a linear_ layer must be > 0"); + + public: + linear_() : + num_outputs(num_outputs_), + num_inputs(0), + learning_rate_multiplier(1), + bias_mode(bias_mode_) { + } + + double get_learning_rate_multiplier() const { return learning_rate_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + + unsigned long get_num_inputs() const { return num_inputs; } + unsigned long get_num_outputs() const { return num_outputs; } + void set_num_outputs(long num) + { + DLIB_CASSERT(num > 0); + if (num != (long)num_outputs) + { + DLIB_CASSERT(get_layer_params().size() == 0, + "You can't change the number of filters in linear_ if the parameter tensor has already been allocated."); + num_outputs = num; + } + } + linear_bias_mode get_bias_mode() const { return bias_mode; } + + template + void setup(const SUBNET& sub) + { + num_inputs = sub.get_output().nc(); + if (bias_mode == LINEAR_HAS_BIAS) + params.set_size(num_inputs + 1, num_outputs); + else + params.set_size(num_inputs, num_outputs); + + dlib::rand rnd(std::rand()); + randomize_parameters(params, num_inputs + num_outputs, rnd); + weights = alias_tensor(num_inputs, num_outputs); + + if (bias_mode == LINEAR_HAS_BIAS) { + biases = alias_tensor(1, num_outputs); + biases(params, weights.size()) = 0; + } + } + + template + void forward(const SUBNET& sub, resizable_tensor& output) + { + const auto& prev_output = sub.get_output(); + DLIB_CASSERT((long)num_inputs == sub.get_output().nc(), + "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with."); + output.set_size(prev_output.num_samples(), prev_output.k(), prev_output.nr(), num_outputs); + + auto o = alias_tensor(output.num_samples() * output.k() * output.nr(), num_outputs)(output, 0); + auto so = alias_tensor(prev_output.num_samples() * prev_output.k() * prev_output.nr(), num_inputs)(prev_output, 0); + + auto w = weights(params, 0); + tt::gemm(0, (tensor&)o, 1, so, false, w, false); + + if (bias_mode == LINEAR_HAS_BIAS) + { + auto b = biases(params, weights.size()); + tt::add(1, (tensor&)o, 1, b); + } + } + + template + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + auto gi = alias_tensor(gradient_input.num_samples() * gradient_input.k() * gradient_input.nr(), num_outputs)(gradient_input, 0); + if (learning_rate_multiplier != 0) + { + const auto& prev_output = sub.get_output(); + auto pw = weights(params_grad, 0); + auto so = alias_tensor(prev_output.num_samples() * prev_output.k() * prev_output.nr(), num_inputs)(prev_output, 0); + tt::gemm(0, pw, learning_rate_multiplier, so, true, gi, false); + + if (bias_mode == LINEAR_HAS_BIAS) + { + auto pb = biases(params_grad, weights.size()); + tt::assign_bias_gradient(pb, gi); + } + } + + const auto& prev_gradient = sub.get_gradient_input(); + auto sgi = alias_tensor(prev_gradient.num_samples() * prev_gradient.k() * prev_gradient.nr(), num_inputs)(prev_gradient, 0); + auto w = weights(params, 0); + tt::gemm(1, (tensor&)sgi, 1, gi, false, w, true); + } + + alias_tensor_instance get_weights() { return weights(params, 0); } + alias_tensor_const_instance get_weights() const { return weights(params, 0); } + alias_tensor_instance get_biases() + { + static_assert(bias_mode == LINEAR_HAS_BIAS, "This linear_ layer doesn't have a bias vector " + "to be retrieved, as per template parameter 'bias_mode'."); + return biases(params, weights.size()); + } + alias_tensor_const_instance get_biases() const + { + static_assert(bias_mode == LINEAR_HAS_BIAS, "This linear_ layer doesn't have a bias vector " + "to be retrieved, as per template parameter 'bias_mode'."); + return biases(params, weights.size()); + } + + inline dpoint map_input_to_output(const dpoint& p) const { return p; } + inline dpoint map_output_to_input(const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const linear_& item, std::ostream& out) + { + serialize("linear_", out); + serialize(item.num_outputs, out); + serialize(item.num_inputs, out); + serialize(item.params, out); + serialize(item.weights, out); + serialize(item.biases, out); + serialize((int)item.bias_mode, out); + serialize(item.learning_rate_multiplier, out); + } + + friend void deserialize(linear_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version == "linear_") + { + deserialize(item.num_outputs, in); + deserialize(item.num_inputs, in); + deserialize(item.params, in); + deserialize(item.weights, in); + deserialize(item.biases, in); + int bmode; + deserialize(bmode, in); + item.bias_mode = static_cast(bmode); + if (bias_mode_ != item.bias_mode) throw serialization_error("Wrong bias_mode found while deserializing dlib::linear_"); + deserialize(item.learning_rate_multiplier, in); + } + else + { + throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::linear_."); + } + } + + friend std::ostream& operator<<(std::ostream& out, const linear_& item) + { + out << "linear\t (num_outputs=" << item.num_outputs; + if (item.bias_mode == LINEAR_HAS_BIAS) + out << ", bias=true"; + else + out << ", bias=false"; + out << ")"; + out << " learning_rate_mult=" << item.learning_rate_multiplier; + return out; + } + + friend void to_xml(const linear_& item, std::ostream& out) + { + out << "\n"; + out << mat(item.params); + out << "\n"; + } + + private: + unsigned long num_inputs; + unsigned long num_outputs; + double learning_rate_multiplier; + linear_bias_mode bias_mode; + resizable_tensor params; + alias_tensor weights, biases; + }; + + template < + unsigned long num_outputs, + typename SUBNET + > + using linear = add_layer, SUBNET>; + + template < + unsigned long num_outputs, + typename SUBNET + > + using linear_no_bias = add_layer, SUBNET>; + // ---------------------------------------------------------------------------------------- class dropout_ diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h index ef2de8e6fe..e5f2d340e0 100644 --- a/dlib/dnn/layers_abstract.h +++ b/dlib/dnn/layers_abstract.h @@ -689,6 +689,266 @@ namespace dlib > using fc_no_bias = add_layer, SUBNET>; + // ---------------------------------------------------------------------------------------- + +// ---------------------------------------------------------------------------------------- + + enum linear_bias_mode + { + LINEAR_HAS_BIAS, + LINEAR_NO_BIAS + }; + + template < + unsigned long num_outputs, + linear_bias_mode bias_mode + > + class linear_ + { + /*! + REQUIREMENTS ON num_outputs + num_outputs > 0 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of a linear layer, which applies a linear + transformation to the input data. For a layer with bias, the transformation + is: + output = input * weights + bias + For a layer without bias, it's simply: + output = input * weights + + The input tensor can have any number of sample, k (channel), and nr (row) + dimensions, but the nc (column) dimension must match the number of input features. + The output tensor will have the same dimensions as the input tensor, except for + the nc dimension which will be equal to num_outputs. + + This layer is similar to the fc_ layer, but optimized for the case where the + input and output tensors maintain the same dimensions, excluding the feature + dimension (nc). This makes it useful for working with multi-dimensional data. + !*/ + + public: + linear_( + ); + /*! + ensures + - #get_num_outputs() == num_outputs + - #get_bias_mode() == bias_mode + - #get_learning_rate_multiplier() == 1 + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier that will be applied to the gradient of this layer during + training. This value appears as a multiplicative factor in the update rule. So + if get_learning_rate_multiplier() == 1 then the learning rate will be multiplied + by 1 and thus not modified. However, if get_learning_rate_multiplier() == 0.1 then + the learning rate will be multiplied by 0.1, making the layer update 10 times + slower than it would otherwise be. + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + ensures + - #get_learning_rate_multiplier() == val + !*/ + + unsigned long get_num_inputs( + ) const; + /*! + ensures + - Returns the number of input features this layer expects. + - For an uninitialized layer (i.e., one that has not seen any data during setup + or forward pass), this will be zero. + !*/ + + unsigned long get_num_outputs( + ) const; + /*! + ensures + - Returns the number of output features this layer produces. + I.e., this value is num_outputs. + !*/ + + void set_num_outputs( + long num + ); + /*! + requires + - num > 0 + ensures + - #get_num_outputs() == num + throws + - std::runtime_error if this function is called after the layer parameters + have been allocated and the new number of outputs doesn't match the + previously set number of outputs. + !*/ + + linear_bias_mode get_bias_mode( + ) const; + /*! + ensures + - Returns a value indicating whether this layer has a bias term. + I.e. returns bias_mode. + !*/ + + template + void setup( + const SUBNET& sub + ); + /*! + ensures + - Performs the necessary setup work to process data through this layer. + - Sets the input size based on the dimensions of the input tensor from sub. + - Allocates the parameter tensor and initializes its values. + - #get_num_inputs() == the number of columns in sub.get_output() (i.e., nc). + !*/ + + template + void forward( + const SUBNET& sub, + resizable_tensor& output + ); + /*! + requires + - setup() has been called + - sub.get_output().nc() == get_num_inputs() + ensures + - Applies the linear transformation to the input tensor from sub and stores + the results in output. + - #output.num_samples() == sub.get_output().num_samples() + - #output.k() == sub.get_output().k() + - #output.nr() == sub.get_output().nr() + - #output.nc() == get_num_outputs() + !*/ + + template + void backward( + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ); + /*! + requires + - setup() has been called + - sub.get_output().nc() == get_num_inputs() + - gradient_input has the same dimensions as the output of forward() + ensures + - Computes the gradients of this layer with respect to the parameters + and the input tensor, and updates the corresponding gradient tensors. + - Updates params_grad based on the gradients of the weights + and biases (if present). + - Updates sub's gradient_input based on the gradients of the + inputs to this layer. + !*/ + + alias_tensor_instance get_weights( + ); + /*! + requires + - setup() has been called + ensures + - Returns a reference to the weights matrix of this layer. + !*/ + + alias_tensor_const_instance get_weights( + ) const; + /*! + requires + - setup() has been called + ensures + - Returns a const reference to the weights matrix of this layer. + !*/ + + alias_tensor_instance get_biases( + ); + /*! + requires + - bias_mode == LINEAR_HAS_BIAS + - setup() has been called + ensures + - Returns a reference to the bias vector of this layer. + throws + - static_assert failure if bias_mode != LINEAR_HAS_BIAS + !*/ + + alias_tensor_const_instance get_biases( + ) const; + /*! + requires + - bias_mode == LINEAR_HAS_BIAS + - setup() has been called + ensures + - Returns a const reference to the bias vector of this layer. + throws + - static_assert failure if bias_mode != LINEAR_HAS_BIAS + !*/ + + dpoint map_input_to_output( + const dpoint& p + ) const; + /*! + ensures + - Returns p, since the linear layer maintains the same spatial dimensions. + !*/ + + dpoint map_output_to_input( + const dpoint& p + ) const; + /*! + ensures + - Returns p, since the linear layer maintains the same spatial dimensions. + !*/ + + const tensor& get_layer_params( + ) const; + /*! + ensures + - Returns the parameters that define this layer, i.e., the weights and biases + (if present) that are updated during training. + !*/ + + tensor& get_layer_params( + ); + /*! + ensures + - Returns the parameters that define this layer, i.e., the weights and biases + (if present) that are updated during training. + !*/ + + friend void serialize(const linear_& item, std::ostream& out); + friend void deserialize(linear_& item, std::istream& in); + /*! + provides serialization support + !*/ + }; + + template < + unsigned long num_outputs, + typename SUBNET + > + using linear = add_layer, SUBNET>; + /*! + This is a layer that applies a linear transformation with bias to the input: + output = input * weights + bias + !*/ + + template < + unsigned long num_outputs, + typename SUBNET + > + using linear_no_bias = add_layer, SUBNET>; + /*! + This is a layer that applies a linear transformation without bias to the input: + output = input * weights + !*/ + + // ---------------------------------------------------------------------------------------- + // ---------------------------------------------------------------------------------------- struct num_con_outputs diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp index 9316a0edc6..fae2f54d49 100644 --- a/dlib/test/dnn.cpp +++ b/dlib/test/dnn.cpp @@ -2419,6 +2419,24 @@ void test_embeddings() auto res = test_layer(l); DLIB_TEST_MSG(res, res); } + { + print_spinner(); + linear_<1, LINEAR_NO_BIAS> l; + auto res = test_layer(l); + DLIB_TEST_MSG(res, res); + } + { + print_spinner(); + linear_<5, LINEAR_NO_BIAS> l; + auto res = test_layer(l); + DLIB_TEST_MSG(res, res); + } + { + print_spinner(); + linear_<4, LINEAR_NO_BIAS> l; + auto res = test_layer(l); + DLIB_TEST_MSG(res, res); + } { print_spinner(); relu_ l; @@ -3527,6 +3545,62 @@ void test_multm_prev() DLIB_TEST_MSG(error_after < 1e-6, "Autoencoder error after training = " << error_after); } +// ---------------------------------------------------------------------------------------- + void test_linear() + { + print_spinner(); + + // Define the network + cout << "ICI !!!" << endl; + using net_type = tag2>>>>; + net_type net; + + // Input tensor + const int n_samples = 3, k = 1; + std::vector> x(n_samples); + matrix xtmp(2, 4); + xtmp = 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f; + x[0] = xtmp; + xtmp = 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f; + x[1] = xtmp; + xtmp = 17.0f, 18.0f, 19.0f, 20.0f, + 21.0f, 22.0f, 23.0f, 24.0f; + x[2] = xtmp; + + // Convert input matrix to tensor + resizable_tensor input_tensor; + net.to_tensor(&x[0], &x[0] + n_samples, input_tensor); + net.forward(input_tensor); + + // Get the internal linear weights + matrix w = mat(layer(net).subnet().layer_details().get_weights()); + + // Theoretical calculation of the output + std::vector> expected_outputs(n_samples); + for (int i = 0; i < n_samples; ++i) { + matrix input_matrix = x[i]; + expected_outputs[i] = input_matrix * w; + } + + // Compare output tensor with expected output + auto& net_output = layer(net).get_output(); + + // Display results + for (int i = 0; i < n_samples; ++i) { + matrix output_sample; + output_sample.set_size(2, 6); + for (long r = 0; r < output_sample.nr(); ++r) { + for (long c = 0; c < output_sample.nc(); ++c) { + output_sample(r, c) = net_output.host()[tensor_index(net_output, i, 0, r, c)]; + } + } + DLIB_TEST_MSG(max(abs(output_sample - expected_outputs[i])) < 1e-5, + "linear layer - sample " + std::to_string(i)); + } + } + // ---------------------------------------------------------------------------------------- void test_loss_mean_squared_per_channel_and_pixel() @@ -5107,6 +5181,7 @@ void test_multm_prev() test_simple_linear_regression_with_mult_prev(); test_multioutput_linear_regression(); test_simple_autoencoder(); + test_linear(); test_loss_mean_squared_per_channel_and_pixel(); test_loss_binary_log_per_pixel_learned_params_on_trivial_two_pixel_task(); test_loss_binary_log_per_pixel_outputs_on_trivial_task(); From 93ead3d113535b150c97b0e2c23b293f502ac2a7 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 2 May 2025 22:12:40 +0200 Subject: [PATCH 02/21] Minor change --- dlib/dnn/layers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h index 445864b4de..a9b5da0ca0 100644 --- a/dlib/dnn/layers.h +++ b/dlib/dnn/layers.h @@ -2203,8 +2203,8 @@ namespace dlib void forward(const SUBNET& sub, resizable_tensor& output) { const auto& prev_output = sub.get_output(); - DLIB_CASSERT((long)num_inputs == sub.get_output().nc(), - "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with."); + DLIB_CASSERT((long)num_inputs == prev_output.nc(), + "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with."); output.set_size(prev_output.num_samples(), prev_output.k(), prev_output.nr(), num_outputs); auto o = alias_tensor(output.num_samples() * output.k() * output.nr(), num_outputs)(output, 0); From bf1b805ad53208762dfd8f87569b3caba7d22360 Mon Sep 17 00:00:00 2001 From: "Davis E. King" Date: Sat, 3 May 2025 10:35:10 -0400 Subject: [PATCH 03/21] Update dlib/dnn/layers.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- dlib/dnn/layers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h index a9b5da0ca0..123d49f523 100644 --- a/dlib/dnn/layers.h +++ b/dlib/dnn/layers.h @@ -2170,7 +2170,7 @@ namespace dlib unsigned long get_num_outputs() const { return num_outputs; } void set_num_outputs(long num) { - DLIB_CASSERT(num > 0); + DLIB_CASSERT(num > 0, "The number of outputs must be > 0, but num == " << num); if (num != (long)num_outputs) { DLIB_CASSERT(get_layer_params().size() == 0, From f234faaaa0337806bd4982f40eb9bc896380d41c Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 6 May 2025 17:40:06 +0200 Subject: [PATCH 04/21] Add reshape_to and flatten layers to Dlib's DNN module --- dlib/dnn/layers.h | 180 +++++++++++++++++++++++++++++++++++++ dlib/dnn/layers_abstract.h | 170 +++++++++++++++++++++++++++++++++++ dlib/test/dnn.cpp | 48 +++++++++- 3 files changed, 397 insertions(+), 1 deletion(-) diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h index 123d49f523..7ec8b1a956 100644 --- a/dlib/dnn/layers.h +++ b/dlib/dnn/layers.h @@ -975,6 +975,186 @@ namespace dlib > using resize_to = add_layer, SUBNET>; +// ---------------------------------------------------------------------------------------- + + template + class reshape_to_ + { + public: + explicit reshape_to_() : + output_k(k_), + output_nr(nr_), + output_nc(nc_) + { + static_assert(k_ == -1 || k_ > 0, "Output k must be positive or -1"); + static_assert(nr_ == -1 || nr_ > 0, "Output nr must be positive or -1"); + static_assert(nc_ == -1 || nc_ > 0, "Output nc must be positive or -1"); + + input_k = input_nr = input_nc = 0; + needs_rescale = false; + } + + // Getters for dimensions + long get_output_k() const { return output_k; } + long get_output_nr() const { return output_nr; } + long get_output_nc() const { return output_nc; } + + // Setters for dimensions + void set_output_k(long k) { + DLIB_CASSERT(k == -1 || k > 0, "Output k must be positive or -1 to keep original dimension"); + output_k = k; + } + void set_output_nr(long nr) { + DLIB_CASSERT(nr == -1 || nr > 0, "output nr must be positive or -1 to keep original dimension"); + output_nr = nr; + } + void set_output_nc(long nc) { + DLIB_CASSERT(nc == -1 || nc > 0, "output nc must be positive or -1 to keep original dimension"); + output_nc = nc; + } + + template + void setup(const SUBNET& sub) + { + const auto& input = sub.get_output(); + input_k = input.k(); + input_nr = input.nr(); + input_nc = input.nc(); + + // Calculate output dimensions using input dims where target is -1 + if (k_ == -1) output_k = input_k; + if (nr_ == -1) output_nr = input_nr; + if (nc_ == -1) output_nc = input_nc; + + // Check if this is well a pure reshape + long input_elements = input_k * input_nr * input_nc; + long output_elements = output_k * output_nr * output_nc; + if (input_elements != output_elements && input_k == output_k) needs_rescale = true; + DLIB_CASSERT(input_elements == output_elements || needs_rescale, + "Cannot reshape tensor of " << input_elements << + " elements into shape with " << output_elements << " elements. " << + "For spatial rescaling, the channel dimension (k) must remain constant."); + } + + template + void forward(const SUBNET& sub, resizable_tensor& output) + { + // Set the output size (always preserving batch dimension) + const tensor& input = sub.get_output(); + output.set_size(input.num_samples(), output_k, output_nr, output_nc); + + if (!needs_rescale) + { + // Create an alias of the input tensor with the output shape + alias_tensor input_alias(output.num_samples(), output_k, output_nr, output_nc); + // Get a view of the input tensor with the new shape + auto input_reshaped = input_alias(const_cast(input), 0); + // Copy the view to the output tensor + tt::copy_tensor(false, output, 0, input_reshaped, 0, input_reshaped.k()); + } + else + { + // Only spatial dimensions need to be resized + tt::resize_bilinear(output, input); + } + } + + template + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto& grad = sub.get_gradient_input(); + + if (!needs_rescale) { + // Create an alias of the gradient tensor with the original input shape + alias_tensor grad_alias(grad.num_samples(), grad.k(), grad.nr(), grad.nc()); + // Get a view of the input gradient with the required shape + auto grad_reshaped = grad_alias(const_cast(gradient_input), 0); + // Copy the view to the output gradient + tt::copy_tensor(true, grad, 0, grad_reshaped, 0, grad_reshaped.k()); + } + else + { + // Only spatial dimensions were resized + tt::resize_bilinear_gradient(grad, gradient_input); + } + } + + // Mapping functions for coordinate transformations + inline dpoint map_input_to_output(const dpoint& p) const { + double scale_x = output_nc / static_cast(input_nc); + double scale_y = output_nr / static_cast(input_nr); + return dpoint(p.x() * scale_x, p.y() * scale_y); + } + inline dpoint map_output_to_input(const dpoint& p) const { + double scale_x = input_nc / static_cast(output_nc); + double scale_y = input_nr / static_cast(output_nr); + return dpoint(p.x() * scale_x, p.y() * scale_y); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const reshape_to_& item, std::ostream& out) + { + serialize("reshape_to_", out); + serialize(item.input_k, out); + serialize(item.input_nr, out); + serialize(item.input_nc, out); + serialize(item.output_k, out); + serialize(item.output_nr, out); + serialize(item.output_nc, out); + serialize(item.needs_rescale, out); + } + + friend void deserialize(reshape_to_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "reshape_to_") + throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::reshape_to_."); + deserialize(item.input_k, in); + deserialize(item.input_nr, in); + deserialize(item.input_nc, in); + deserialize(item.output_k, in); + deserialize(item.output_nr, in); + deserialize(item.output_nc, in); + deserialize(item.needs_rescale, in); + } + + friend std::ostream& operator<<(std::ostream& out, const reshape_to_& item) + { + out << "reshape_to ("; + out << "k=" << std::to_string(item.output_k); + out << ", nr=" << std::to_string(item.output_nr); + out << ", nc=" << std::to_string(item.output_nc); + out << ", mode=" << (item.needs_rescale ? "spatial_rescale" : "pure_reshape"); + out << ")"; + return out; + } + + friend void to_xml(const reshape_to_& item, std::ostream& out) + { + out << "\n"; + } + + private: + long input_k, input_nr, input_nc; // Input dimensions + long output_k, output_nr, output_nc; // Output dimensions + bool needs_rescale; + resizable_tensor params; // No trainable parameters + }; + + template + using reshape_to = add_layer, SUBNET>; + + template + using flatten = add_layer, SUBNET>; + // ---------------------------------------------------------------------------------------- template < diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h index e5f2d340e0..f0512d7d4c 100644 --- a/dlib/dnn/layers_abstract.h +++ b/dlib/dnn/layers_abstract.h @@ -1642,6 +1642,176 @@ namespace dlib > using resize_to = add_layer, SUBNET>; +// ---------------------------------------------------------------------------------------- + + template + class reshape_to_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + - k_, nr_, and nc_ must be either -1 or greater than 0. + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. It defines a layer that reshapes or resizes an input tensor + into a different shape. The layer operates in two modes: + + 1. Pure Reshape Mode: When the total number of elements in the input tensor + equals the total number of elements in the output tensor, this layer + performs a simple reshaping operation without changing the values. + + 2. Spatial Rescaling Mode: When the channel dimension (k) remains constant + but the total number of elements changes, this layer performs bilinear + interpolation to resize the spatial dimensions while preserving the + channel information. + + The dimensions of the output tensor are determined by the template parameters: + - If k_ is -1, the output tensor will have the same number of channels as the input. + - If nr_ is -1, the output tensor will have the same number of rows as the input. + - If nc_ is -1, the output tensor will have the same number of columns as the input. + + Setting a value of -1 for any dimension means "keep the original dimension from the input." + + Note that this layer will throw an exception if you attempt to change both the + channel count (k) and the total number of elements. Either: + - Keep the total number of elements the same (Pure Reshape Mode), or + - Keep the channel count the same and only change spatial dimensions (Spatial Rescaling Mode) + !*/ + + public: + explicit reshape_to_(); + /*! + ensures + - #get_output_k() == k_ + - #get_output_nr() == nr_ + - #get_output_nc() == nc_ + !*/ + + long get_output_k() const; + /*! + ensures + - Returns the number of channels in the output tensor. If this value is -1, + then the output will have the same number of channels as the input. + !*/ + + long get_output_nr() const; + /*! + ensures + - Returns the number of rows in the output tensor. If this value is -1, + then the output will have the same number of rows as the input. + !*/ + + long get_output_nc() const; + /*! + ensures + - Returns the number of columns in the output tensor. If this value is -1, + then the output will have the same number of columns as the input. + !*/ + + void set_output_k(long k); + /*! + requires + - k == -1 || k > 0 + ensures + - #get_output_k() == k + !*/ + + void set_output_nr(long nr); + /*! + requires + - nr == -1 || nr > 0 + ensures + - #get_output_nr() == nr + !*/ + + void set_output_nc(long nc); + /*! + requires + - nc == -1 || nc > 0 + ensures + - #get_output_nc() == nc + !*/ + + template void setup(const SUBNET& sub); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + ensures + - Configures this layer to operate on the output of sub. + - If the total number of elements in the input tensor doesn't match the total + number of elements in the output tensor and the channel dimension is different, + an exception will be thrown. + !*/ + + template void forward(const SUBNET& sub, resizable_tensor& output); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + - setup() has been called. + ensures + - Reshapes or resizes the output of sub and stores it in #output. + - If is_spatial_rescale() == false, then performs a pure reshape operation. + - If is_spatial_rescale() == true, then performs bilinear interpolation to resize + the spatial dimensions while preserving the channel information. + - #output.num_samples() == sub.get_output().num_samples() + - #output.k() == get_output_k() if get_output_k() != -1, otherwise sub.get_output().k() + - #output.nr() == get_output_nr() if get_output_nr() != -1, otherwise sub.get_output().nr() + - #output.nc() == get_output_nc() if get_output_nc() != -1, otherwise sub.get_output().nc() + !*/ + + template void backward( + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + - setup() has been called. + - gradient_input has the same dimensions as the output of forward(). + ensures + - Computes the gradients of this layer with respect to the input tensor and + parameters, and stores them in sub.get_gradient_input() and params_grad, + respectively. + - This function supports both pure reshaping and spatial rescaling operations. + !*/ + + dpoint map_input_to_output(dpoint p) const; + /*! + ensures + - Maps a point in the input tensor's coordinate system to the corresponding point + in the output tensor. This is useful for tracking how spatial locations change + through the network, especially during spatial rescaling. + !*/ + + dpoint map_output_to_input(dpoint p) const; + /*! + ensures + - Maps a point in the output tensor's coordinate system to the corresponding point + in the input tensor. This is the inverse of map_input_to_output(). + !*/ + + const tensor& get_layer_params() const; + /*! + ensures + - Returns the layer's parameters. This layer has no parameters, + so this always returns an empty tensor. + !*/ + + tensor& get_layer_params(); + /*! + ensures + - Returns the layer's parameters. This layer has no parameters, + so this always returns an empty tensor. + !*/ + }; + + template + using reshape_to = add_layer, SUBNET>; + + template + using flatten = add_layer, SUBNET>; + // ---------------------------------------------------------------------------------------- class dropout_ diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp index fae2f54d49..fc18fe95b6 100644 --- a/dlib/test/dnn.cpp +++ b/dlib/test/dnn.cpp @@ -2538,7 +2538,19 @@ void test_embeddings() embeddings_<7, 12> l; auto res = test_layer(l); DLIB_TEST_MSG(res, res); - } + } + { + print_spinner(); + reshape_to_<-1, -1, -1> l; + auto res = test_layer(l); + DLIB_TEST_MSG(res, res); + } + { + print_spinner(); + reshape_to_<-1, 3, 5> l; + auto res = test_layer(l); + DLIB_TEST_MSG(res, res); + } } // ---------------------------------------------------------------------------------------- @@ -4801,6 +4813,39 @@ void test_multm_prev() } } + void test_resize_to() { + print_spinner(); + const long nr = 8, nc = 12; + const long n_samples = 5, k = 1, h = 4; + + using net_type = tag1>>>>>; + net_type net; + + dlib::rand rnd; + std::vector> x(n_samples); + matrix xtmp(nr, nc); + for (int ii = 0; ii < n_samples; ++ii) { + for (int jj = 0; jj < nr; ++jj) + for (int kk = 0; kk < nc; ++kk) + xtmp(jj, kk) = rnd.get_random_gaussian(); + x[ii] = xtmp; + } + + resizable_tensor input_tensor; + net.to_tensor(&x[0], &x[0] + n_samples, input_tensor); + net.forward(input_tensor); + + auto& output_tensor = layer(net).get_output(); + + DLIB_TEST(output_tensor.num_samples() == input_tensor.num_samples()); + DLIB_TEST(output_tensor.k() == input_tensor.k()); + DLIB_TEST(output_tensor.nr() == input_tensor.nr()); + DLIB_TEST(output_tensor.nc() == input_tensor.nc()); + DLIB_TEST(max(abs(mat(output_tensor) - mat(input_tensor))) < 1e-5); + } + // ---------------------------------------------------------------------------------------- template @@ -5168,6 +5213,7 @@ void test_multm_prev() test_embeddings(); test_tril(); test_basic_tensor_ops(); + test_resize_to(); test_layers(); test_visit_functions(); test_copy_tensor_cpu(); From 26a29603e76ef62cfa953aff11b89951eaf017f4 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 22 May 2025 11:10:07 +0200 Subject: [PATCH 05/21] Missing update to "visitors.h" --- dlib/dnn/visitors.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dlib/dnn/visitors.h b/dlib/dnn/visitors.h index 589e3556ef..6f0920369b 100644 --- a/dlib/dnn/visitors.h +++ b/dlib/dnn/visitors.h @@ -797,6 +797,15 @@ namespace dlib update(i); } + template + void operator()(size_t i, const add_layer, U, E>& l) + { + start_node(i, "linear"); + out << " | { outputs |{" << l.layer_details().get_num_outputs() << "}}"; + end_node(); + update(i); + } + template void operator()(size_t i, const add_layer&) { @@ -1031,6 +1040,17 @@ namespace dlib update(i); } + template + void operator()(size_t i, const add_layer, U, E>&) + { + start_node(i, "reshape_to"); + out << " | {k|{" << (k != -1 ? k : "unchanged") << "}}"; + out << " | {nr|{" << (nr != -1 ? nr : "unchanged") << "}}"; + out << " | {nc|{" << (nc != -1 ? nc : "unchanged") << "}}"; + end_node(); + update(i); + } + template void operator()(size_t i, const add_layer&) { From c9a1ee4d096e0712f31d1e786f05c707d2db6819 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 22 May 2025 11:28:22 +0200 Subject: [PATCH 06/21] format fixing for reshape_to --- dlib/dnn/visitors.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dlib/dnn/visitors.h b/dlib/dnn/visitors.h index 6f0920369b..d9f7401974 100644 --- a/dlib/dnn/visitors.h +++ b/dlib/dnn/visitors.h @@ -1043,10 +1043,13 @@ namespace dlib template void operator()(size_t i, const add_layer, U, E>&) { - start_node(i, "reshape_to"); - out << " | {k|{" << (k != -1 ? k : "unchanged") << "}}"; - out << " | {nr|{" << (nr != -1 ? nr : "unchanged") << "}}"; - out << " | {nc|{" << (nc != -1 ? nc : "unchanged") << "}}"; + start_node(i, "reshape_to"); + if (k == -1) out << " | {k|{unchanged}}"; + else out << " | {k|{" << k << "}}"; + if (nr == -1) out << " | {nr|{unchanged}}"; + else out << " | {nr|{" << nr << "}}"; + if (nc == -1) out << " | {nc|{unchanged}}"; + else out << " | {nc|{" << nc << "}}"; end_node(); update(i); } From 02e62d8a36bcd4b51c6a39d87cb44d2e7cad00a5 Mon Sep 17 00:00:00 2001 From: "Davis E. King" Date: Fri, 23 May 2025 07:27:28 -0400 Subject: [PATCH 07/21] Update dlib/test/dnn.cpp --- dlib/test/dnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp index fc18fe95b6..b62a341c73 100644 --- a/dlib/test/dnn.cpp +++ b/dlib/test/dnn.cpp @@ -5213,7 +5213,7 @@ void test_multm_prev() test_embeddings(); test_tril(); test_basic_tensor_ops(); - test_resize_to(); + test_resize_to(); test_layers(); test_visit_functions(); test_copy_tensor_cpu(); From 778bfc1907be274b3b3677e2f63876efe5f8733e Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 29 May 2025 12:31:06 +0200 Subject: [PATCH 08/21] Vocabulary size fixed for learning, and function added for transformation-free tokenization --- dlib/tokenizer/bpe_tokenizer.h | 130 ++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 42 deletions(-) diff --git a/dlib/tokenizer/bpe_tokenizer.h b/dlib/tokenizer/bpe_tokenizer.h index f9457b554f..f1ae88cf43 100644 --- a/dlib/tokenizer/bpe_tokenizer.h +++ b/dlib/tokenizer/bpe_tokenizer.h @@ -20,49 +20,47 @@ namespace dlib { - constexpr size_t BPE_TOKENIZER_MAX_TOKEN_LENGTH = 8; - constexpr int BPE_TOKENIZER_BASE_VOCAB_SIZE = 256; class bpe_tokenizer { public: - bpe_tokenizer() : vocab_size(BPE_TOKENIZER_BASE_VOCAB_SIZE) + bpe_tokenizer() : vocab_size(BASE_VOCAB_SIZE) { // Initialize the base vocabulary with single bytes - for (int i = 0; i < BPE_TOKENIZER_BASE_VOCAB_SIZE; ++i) + for (int i = 0; i < BASE_VOCAB_SIZE; ++i) vocab[i] = std::vector{ static_cast(i) }; // Initialize special tokens with sequential IDs special_tokens = { - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 1}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 2}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 3}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 4}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 5}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 7}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 9}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 10}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 11}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 12}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 13}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 14}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 15}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 16}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 17}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 18}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 19}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 20}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 21}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 22}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 23}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 24}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 25}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 26}, - {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 27} + {"", BASE_VOCAB_SIZE}, + {"", BASE_VOCAB_SIZE + 1}, + {"", BASE_VOCAB_SIZE + 2}, + {"", BASE_VOCAB_SIZE + 3}, + {"", BASE_VOCAB_SIZE + 4}, + {"", BASE_VOCAB_SIZE + 5}, + {"", BASE_VOCAB_SIZE + 7}, + {"", BASE_VOCAB_SIZE + 9}, + {"", BASE_VOCAB_SIZE + 10}, + {"", BASE_VOCAB_SIZE + 11}, + {"", BASE_VOCAB_SIZE + 12}, + {"", BASE_VOCAB_SIZE + 13}, + {"", BASE_VOCAB_SIZE + 14}, + {"", BASE_VOCAB_SIZE + 15}, + {"", BASE_VOCAB_SIZE + 16}, + {"", BASE_VOCAB_SIZE + 17}, + {"", BASE_VOCAB_SIZE + 18}, + {"", BASE_VOCAB_SIZE + 19}, + {"", BASE_VOCAB_SIZE + 20}, + {"", BASE_VOCAB_SIZE + 21}, + {"", BASE_VOCAB_SIZE + 22}, + {"", BASE_VOCAB_SIZE + 23}, + {"", BASE_VOCAB_SIZE + 24}, + {"", BASE_VOCAB_SIZE + 25}, + {"", BASE_VOCAB_SIZE + 26}, + {"", BASE_VOCAB_SIZE + 27} }; // Initialize the vector of special token IDs @@ -73,9 +71,9 @@ namespace dlib // Train the tokenizer on the given text void train(const std::string& text, int vocab_size, bool verbose = false) { - DLIB_CASSERT(vocab_size >= BPE_TOKENIZER_BASE_VOCAB_SIZE); + DLIB_CASSERT(vocab_size >= (BASE_VOCAB_SIZE + special_tokens.size())); this->vocab_size = vocab_size; - int num_merges = vocab_size - BPE_TOKENIZER_BASE_VOCAB_SIZE; + int num_merges = vocab_size - (BASE_VOCAB_SIZE + special_tokens.size()); // Convert text to byte IDs std::vector ids; @@ -86,25 +84,25 @@ namespace dlib auto stats = get_stats(ids); if (stats.empty()) break; - // Find the most frequent pair that does not exceed BPE_TOKENIZER_MAX_TOKEN_LENGTH + // Find the most frequent pair that does not exceed MAX_TOKEN_LENGTH auto pair = get_most_frequent_pair(stats); - // Check if the resulting token would exceed BPE_TOKENIZER_MAX_TOKEN_LENGTH + // Check if the resulting token would exceed MAX_TOKEN_LENGTH size_t new_token_length = vocab[pair.first].size() + vocab[pair.second].size(); - if (new_token_length > BPE_TOKENIZER_MAX_TOKEN_LENGTH) { + if (new_token_length > MAX_TOKEN_LENGTH) { if (verbose) { std::cout << "\r" << std::setw(100) << std::flush << "\rskipping merge " << std::to_string(i + 1) << "/" << std::to_string(num_merges) << ": (" << std::to_string(pair.first) << "," << std::to_string(pair.second) << ") -> new token length " - << std::to_string(new_token_length) << " exceeds limit of " << std::to_string(BPE_TOKENIZER_MAX_TOKEN_LENGTH) + << std::to_string(new_token_length) << " exceeds limit of " << std::to_string(MAX_TOKEN_LENGTH) << std::flush; } continue; // Skip this merge } - int idx = (BPE_TOKENIZER_BASE_VOCAB_SIZE + (int)special_tokens.size()) + i; + int idx = (BASE_VOCAB_SIZE + (int)special_tokens.size()) + i; ids = merge(ids, pair, idx); merges[pair] = idx; vocab[idx].insert(vocab[idx].end(), vocab[pair.first].begin(), vocab[pair.first].end()); @@ -123,7 +121,52 @@ namespace dlib } } - // Encode the given text into subword tokens + // Encode the given text into subword tokens without paragraph splitting or special token wrapping + std::vector encode_raw(const std::string& text) const + { + // Direct encoding without paragraph splitting or special tokens + std::vector ids; + ids.reserve(text.size()); + + // Convert text to character IDs + for (char c : text) ids.push_back(static_cast(c)); + + // Apply BPE merges + auto stats = get_stats(ids); + std::priority_queue>> pq; + for (const auto& stat : stats) { + const std::pair& pair = stat.first; + if (merges.count(pair)) pq.push({ merges.at(pair), pair }); + } + + while (!pq.empty()) { + const auto& top_element = pq.top(); + const std::pair& pair = top_element.second; + pq.pop(); + + bool pair_found = false; + for (size_t i = 0; i < ids.size() - 1; ++i) { + if (ids[i] == pair.first && ids[i + 1] == pair.second) { + pair_found = true; + break; + } + } + if (!pair_found) continue; + + int idx = merges.at(pair); + ids = merge(ids, pair, idx); + + stats = get_stats(ids); + for (const auto& stat : stats) { + const std::pair& new_pair = stat.first; + if (merges.count(new_pair)) pq.push({ merges.at(new_pair), new_pair }); + } + } + + return ids; + } + + // Encode the given text into subword tokens (advanced version) std::vector encode(const std::string& text) const { std::vector result_ids; @@ -289,6 +332,9 @@ namespace dlib std::map> vocab; int vocab_size; + static const size_t MAX_TOKEN_LENGTH = 8; + static const int BASE_VOCAB_SIZE = 256; + // Get frequency statistics of adjacent token pairs struct pair_hash { template @@ -343,10 +389,10 @@ namespace dlib // Check if the new token formed by merging the pair would exceed the maximum allowed length size_t new_token_length = vocab.at(pair.first).size() + vocab.at(pair.second).size(); - if (new_token_length > BPE_TOKENIZER_MAX_TOKEN_LENGTH) continue; // Skip this pair if it exceeds the maximum token length + if (new_token_length > MAX_TOKEN_LENGTH) continue; // Skip this pair if it exceeds the maximum token length // Calculate the score for this pair (frequency * length_penalty) - double score = (size_t)count * (new_token_length > (BPE_TOKENIZER_MAX_TOKEN_LENGTH / 2) ? 1.75 : 1.0); + double score = (size_t)count * (new_token_length > (MAX_TOKEN_LENGTH / 2) ? 1.75 : 1.0); // Update the best pair if the current pair has a higher score if (score > max_score) From 03aafc2cd92c231b35cd3ac0b5aa25139c5de2c2 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 29 May 2025 12:32:41 +0200 Subject: [PATCH 09/21] =?UTF-8?q?Added=20a=20new=20example=20for=20learnin?= =?UTF-8?q?g=20a=20=E2=80=9Ccomplex=E2=80=9D=20Transformer=20model.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/slm_advanced_train_ex.cpp | 1293 ++++++++++++++++++++++++++++ 1 file changed, 1293 insertions(+) create mode 100644 examples/slm_advanced_train_ex.cpp diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp new file mode 100644 index 0000000000..81b9badd72 --- /dev/null +++ b/examples/slm_advanced_train_ex.cpp @@ -0,0 +1,1293 @@ +/*! + @file slm_advanced_train_ex.cpp + @brief Transformer-based text training/generation + + This program implements a complete training and generation pipeline for a + Transformer-based text compression system. + The model features: + + 1. Rotary Positional Embeddings (RoPE) for enhanced positional encoding + 2. Multi-head self-attention with efficient memory handling + 3. Mixture-of-Experts architecture for specialized processing + 4. BPE tokenization with custom vocabulary + 5. Full training/generation/verification workflow + + Key capabilities demonstrated: + - Perfect memorization and reproduction of training text + - Efficient autoregressive generation + - Byte-level verification of reconstructed text + + References: + [1] Vaswani et al., "Attention Is All You Need" (Transformer architecture) + arXiv:1706.03762 + [2] Su et al., "RoFormer: Enhanced Transformer with Rotary Position Embedding" + arXiv:2104.09864 + [3] Shazeer et al., "Outrageously Large Neural Networks: The Sparsely-Gated + Mixture-of-Experts Layer" (MoE architecture) arXiv:1701.06538 + + Usage modes: + --train Train model on enwiki dataset + --generate Generate text from trained model + --verify Compare generated output with original + --tokenize-only Only perform tokenization step + + Configuration: + - Adjust template parameters in transformer_config for model architecture + - Modify training parameters in main() for optimization + - Set sequence length and memory limits according to available hardware +!*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace dlib; + +namespace ernie +{ + class rotary_positional_embedding_ { + public: + explicit rotary_positional_embedding_() = default; + + template + void setup(const SUBNET& sub) { + // Precompute the rotation angles and their trigonometric values + seq_len = sub.get_output().nr(); + d_head = sub.get_output().nc(); + compute_rotation_angles(); + precompute_trigonometric_values(); + } + + template + void forward(const SUBNET& sub, resizable_tensor& output) { + const tensor& input = sub.get_output(); + output.copy_size(input); + tt::copy_tensor(false, output, 0, input, 0, input.k()); + + // Apply rotary embedding to the output + apply_rotary_embedding(output); + } + + template + void backward( + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) { + tensor& prev = sub.get_gradient_input(); + resizable_tensor grad_output; + grad_output.copy_size(gradient_input); + tt::copy_tensor(false, grad_output, 0, gradient_input, 0, gradient_input.k()); + + // Apply the inverse rotation to the gradient (transpose of the rotation matrix) + apply_rotary_embedding(grad_output, true); + tt::copy_tensor(true, prev, 0, grad_output, 0, grad_output.k()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const rotary_positional_embedding_& item, std::ostream& out) { + serialize("rotary_positional_embedding_", out); + serialize(item.seq_len, out); + serialize(item.d_head, out); + serialize(item.angles, out); + serialize(item.cos_values, out); + serialize(item.sin_values, out); + } + + friend void deserialize(rotary_positional_embedding_& item, std::istream& in) { + std::string version; + deserialize(version, in); + if (version != "rotary_positional_embedding_") + throw serialization_error("Unexpected version found while deserializing rotary_positional_embedding_."); + deserialize(item.seq_len, in); + deserialize(item.d_head, in); + deserialize(item.angles, in); + deserialize(item.cos_values, in); + deserialize(item.sin_values, in); + } + + friend std::ostream& operator<<(std::ostream& out, const rotary_positional_embedding_& item) { + out << "rotary_positional_embedding"; + out << " (d_head=" << item.d_head << ", seq_len=" << item.seq_len << ")"; + return out; + } + + friend void to_xml(const rotary_positional_embedding_& item, std::ostream& out) + { + out << "\n"; + } + + protected: + void compute_rotation_angles() { + // Following the original RoPE paper formulation + const float base = 10000.0f; + const long half_dim = d_head / 2; + angles.set_size(seq_len, half_dim); + + for (long pos = 0; pos < seq_len; ++pos) { + for (long i = 0; i < half_dim; ++i) { + float inv_freq = std::pow(base, -2.0f * (i + 0.5f) / d_head); + angles(pos, i) = pos * inv_freq; + } + } + } + + void precompute_trigonometric_values() { + // Precompute cos and sin for all angles + cos_values.set_size(angles.nr(), angles.nc()); + sin_values.set_size(angles.nr(), angles.nc()); + + for (long i = 0; i < angles.size(); ++i) { + cos_values(i) = std::cos(angles(i)); + sin_values(i) = std::sin(angles(i)); + } + } + + template + void apply_rotary_embedding( + tensor_type& x, + bool is_backward = false + ) const { + const long batch_size = x.num_samples(); + const long num_heads = x.k(); + const long seq_length = x.nr(); + const long dim = x.nc(); + const bool is_odd = (dim % 2 != 0); + const long rot_dim = is_odd ? dim - 1 : dim; + + DLIB_CASSERT(dim == d_head, "Input dimension must match d_head param"); + DLIB_CASSERT(seq_length == seq_len, "Sequence length must match seq_len param"); + + auto* ptr = x.host(); + const long stride = seq_length * dim; + + for (long n = 0; n < batch_size; ++n) { + for (long h = 0; h < num_heads; ++h) { + auto* x_ptr = ptr + (n * num_heads + h) * stride; + + for (long pos = 0; pos < seq_length; ++pos) { + const float* cos = &cos_values(pos, 0); + const float* sin = &sin_values(pos, 0); + + for (long i = 0; i < rot_dim; i += 2) { + const float x0 = x_ptr[pos * dim + i]; + const float x1 = x_ptr[pos * dim + i + 1]; + + if (!is_backward) { + x_ptr[pos * dim + i] = x0 * cos[i / 2] - x1 * sin[i / 2]; + x_ptr[pos * dim + i + 1] = x0 * sin[i / 2] + x1 * cos[i / 2]; + } + else { + x_ptr[pos * dim + i] = x0 * cos[i / 2] + x1 * sin[i / 2]; + x_ptr[pos * dim + i + 1] = -x0 * sin[i / 2] + x1 * cos[i / 2]; + } + } + } + } + } + } + + private: + long seq_len, d_head; // Sequence length and dimension of each head + matrix angles; // Precomputed rotation angles (seq_len x d_head/2) + matrix cos_values; // Precomputed cosine values + matrix sin_values; // Precomputed sine values + resizable_tensor params; // Empty tensor (no learnable parameters) + }; + + // Helper to easily add RoPE to a network + template + using rope = add_layer; + + template + class scale_weights_ : public multiply_ { + public: + explicit scale_weights_() : multiply_(1.0f / std::sqrt(static_cast(d_k_))) {} + }; + + template + using scale_weights = add_layer, SUBNET>; + + // Attention mechanism component extractors + template + using query = reshape_to>; + + template + using key = reshape_to>; + + template + using value = reshape_to>; + + /*! + This layer implements multi-head self-attention. + + Template parameters: + - ACT: Activation function type + - DO: Dropout layer type for regularization + - d_model: Model dimension (must be divisible by num_heads) + - num_heads: Number of attention heads + !*/ + template