From 3e9b9f14290e034e6ddcf76c71b4f4182b377520 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Mon, 28 Apr 2025 22:10:06 +0200
Subject: [PATCH 01/21] Implementation of linear_ layer for neural networks.
 This layer provides an optimized linear transformation for multi-dimensional
 inputs.

---
 dlib/dnn/layers.h          | 200 ++++++++++++++++++++++++++++
 dlib/dnn/layers_abstract.h | 260 +++++++++++++++++++++++++++++++++++++
 dlib/test/dnn.cpp          |  75 +++++++++++
 3 files changed, 535 insertions(+)
diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index 0a0c547f33..445864b4de 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2143,6 +2143,206 @@ namespace dlib
         >
     using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>;
 
+// ----------------------------------------------------------------------------------------
+    
+    enum linear_bias_mode { LINEAR_HAS_BIAS = 0, LINEAR_NO_BIAS = 1 };
+
+    template <
+        unsigned long num_outputs_,
+        linear_bias_mode bias_mode_
+    >
+    class linear_
+    {
+        static_assert(num_outputs_ > 0, "The number of outputs from a linear_ layer must be > 0");
+
+    public:
+        // Builds an unconfigured layer; initializers are listed in member declaration order.
+        linear_() :
+            num_inputs(0),   // stays 0 until setup() reads the subnet's output width
+            num_outputs(num_outputs_),
+            learning_rate_multiplier(1),
+            bias_mode(bias_mode_) {}
+
+        double get_learning_rate_multiplier() const { return learning_rate_multiplier; }
+        void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+
+        unsigned long get_num_inputs() const { return num_inputs; }
+        unsigned long get_num_outputs() const { return num_outputs; }
+        void set_num_outputs(long num)
+        {
+            DLIB_CASSERT(num > 0);
+            if (num != (long)num_outputs)
+            {
+                DLIB_CASSERT(get_layer_params().size() == 0,
+                    "You can't change the number of outputs in linear_ if the parameter tensor has already been allocated.");
+                num_outputs = num;  // num > 0 was asserted above, so storing it as unsigned is safe
+            }
+        }
+        linear_bias_mode get_bias_mode() const { return bias_mode; }
+
+        template <typename SUBNET>
+        void setup(const SUBNET& sub)
+        {
+            num_inputs = sub.get_output().nc();
+            if (bias_mode == LINEAR_HAS_BIAS)
+                params.set_size(num_inputs + 1, num_outputs);
+            else
+                params.set_size(num_inputs, num_outputs);
+
+            dlib::rand rnd(std::rand());
+            randomize_parameters(params, num_inputs + num_outputs, rnd);
+            weights = alias_tensor(num_inputs, num_outputs);
+
+            if (bias_mode == LINEAR_HAS_BIAS) {
+                biases = alias_tensor(1, num_outputs);
+                biases(params, weights.size()) = 0;
+            }
+        }
+
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output)
+        {
+            const auto& prev_output = sub.get_output();
+            DLIB_CASSERT((long)num_inputs == sub.get_output().nc(),
+                "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with.");
+            output.set_size(prev_output.num_samples(), prev_output.k(), prev_output.nr(), num_outputs);
+
+            auto o = alias_tensor(output.num_samples() * output.k() * output.nr(), num_outputs)(output, 0);
+            auto so = alias_tensor(prev_output.num_samples() * prev_output.k() * prev_output.nr(), num_inputs)(prev_output, 0);
+
+            auto w = weights(params, 0);
+            tt::gemm(0, (tensor&)o, 1, so, false, w, false);
+
+            if (bias_mode == LINEAR_HAS_BIAS)
+            {
+                auto b = biases(params, weights.size());
+                tt::add(1, (tensor&)o, 1, b);
+            }
+        }
+
+        template <typename SUBNET>
+        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+        {
+            auto gi = alias_tensor(gradient_input.num_samples() * gradient_input.k() * gradient_input.nr(), num_outputs)(gradient_input, 0);
+            if (learning_rate_multiplier != 0)
+            {
+                const auto& prev_output = sub.get_output();
+                auto pw = weights(params_grad, 0);
+                auto so = alias_tensor(prev_output.num_samples() * prev_output.k() * prev_output.nr(), num_inputs)(prev_output, 0);
+                tt::gemm(0, pw, learning_rate_multiplier, so, true, gi, false);
+
+                if (bias_mode == LINEAR_HAS_BIAS)
+                {
+                    auto pb = biases(params_grad, weights.size());
+                    tt::assign_bias_gradient(pb, gi);
+                }
+            }
+
+            const auto& prev_gradient = sub.get_gradient_input();
+            auto sgi = alias_tensor(prev_gradient.num_samples() * prev_gradient.k() * prev_gradient.nr(), num_inputs)(prev_gradient, 0);
+            auto w = weights(params, 0);
+            tt::gemm(1, (tensor&)sgi, 1, gi, false, w, true);
+        }
+
+        alias_tensor_instance get_weights() { return weights(params, 0); }
+        alias_tensor_const_instance get_weights() const { return weights(params, 0); }
+        // Mutable bias view; asserts on template arg bias_mode_ (member bias_mode is not a constant expression).
+        alias_tensor_instance get_biases() {
+            static_assert(bias_mode_ == LINEAR_HAS_BIAS, "This linear_ layer doesn't have a bias vector "
+                "to be retrieved, as per template parameter 'bias_mode'.");
+            return biases(params, weights.size());
+        }
+        // Read-only bias view; asserts on template arg bias_mode_ (member bias_mode is not a constant expression).
+        alias_tensor_const_instance get_biases() const {
+            static_assert(bias_mode_ == LINEAR_HAS_BIAS, "This linear_ layer doesn't have a bias vector "
+                "to be retrieved, as per template parameter 'bias_mode'.");
+            return biases(params, weights.size());
+        }
+
+        inline dpoint map_input_to_output(const dpoint& p) const { return p; }
+        inline dpoint map_output_to_input(const dpoint& p) const { return p; }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const linear_& item, std::ostream& out)
+        {
+            serialize("linear_", out);
+            serialize(item.num_outputs, out);
+            serialize(item.num_inputs, out);
+            serialize(item.params, out);
+            serialize(item.weights, out);
+            serialize(item.biases, out);
+            serialize((int)item.bias_mode, out);
+            serialize(item.learning_rate_multiplier, out);
+        }
+
+        friend void deserialize(linear_& item, std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version == "linear_")
+            {
+                deserialize(item.num_outputs, in);
+                deserialize(item.num_inputs, in);
+                deserialize(item.params, in);
+                deserialize(item.weights, in);
+                deserialize(item.biases, in);
+                int bmode;
+                deserialize(bmode, in);
+                item.bias_mode = static_cast<linear_bias_mode>(bmode);
+                if (bias_mode_ != item.bias_mode) throw serialization_error("Wrong bias_mode found while deserializing dlib::linear_");
+                deserialize(item.learning_rate_multiplier, in);
+            }
+            else
+            {
+                throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::linear_.");
+            }
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const linear_& item)
+        {
+            out << "linear\t (num_outputs=" << item.num_outputs;
+            if (item.bias_mode == LINEAR_HAS_BIAS)
+                out << ", bias=true";
+            else
+                out << ", bias=false";
+            out << ")";
+            out << " learning_rate_mult=" << item.learning_rate_multiplier;
+            return out;
+        }
+
+        friend void to_xml(const linear_& item, std::ostream& out)
+        {
+            out << "<linear"
+                << " num_outputs='" << item.num_outputs << "'"
+                << " bias='" << ((item.bias_mode == LINEAR_HAS_BIAS) ? "true" : "false") << "'"
+                << " learning_rate_mult='" << item.learning_rate_multiplier << "'>\n";
+            out << mat(item.params);
+            out << "</linear>\n";
+        }
+
+    private:
+        unsigned long num_inputs;
+        unsigned long num_outputs;
+        double learning_rate_multiplier;
+        linear_bias_mode bias_mode;
+        resizable_tensor params;
+        alias_tensor weights, biases;
+    };
+
+    template <
+        unsigned long num_outputs,
+        typename SUBNET
+    >
+    using linear = add_layer<linear_<num_outputs, LINEAR_HAS_BIAS>, SUBNET>;
+
+    template <
+        unsigned long num_outputs,
+        typename SUBNET
+    >
+    using linear_no_bias = add_layer<linear_<num_outputs, LINEAR_NO_BIAS>, SUBNET>;
+
 // ----------------------------------------------------------------------------------------
 
     class dropout_
diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h
index ef2de8e6fe..e5f2d340e0 100644
--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
@@ -689,6 +689,266 @@ namespace dlib
         >
     using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>;
 
+    // ----------------------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------------------
+
+    enum linear_bias_mode
+    {
+        LINEAR_HAS_BIAS,
+        LINEAR_NO_BIAS
+    };
+
+    template <
+        unsigned long num_outputs,
+        linear_bias_mode bias_mode
+    >
+    class linear_
+    {
+        /*!
+            REQUIREMENTS ON num_outputs
+                num_outputs > 0
+
+            WHAT THIS OBJECT REPRESENTS
+                This is an implementation of a linear layer, which applies a linear
+                transformation to the input data. For a layer with bias, the transformation
+                is:
+                    output = input * weights + bias
+                For a layer without bias, it's simply:
+                    output = input * weights
+
+                The input tensor can have any number of samples, channels (k), and rows (nr),
+                but its nc (column) dimension must match the number of input features.
+                The output tensor will have the same dimensions as the input tensor, except for
+                the nc dimension which will be equal to num_outputs.
+
+                This layer is similar to the fc_ layer, but optimized for the case where the
+                input and output tensors maintain the same dimensions, excluding the feature
+                dimension (nc). This makes it useful for working with multi-dimensional data.
+        !*/
+
+    public:
+        linear_(
+        );
+        /*!
+            ensures
+                - #get_num_outputs() == num_outputs
+                - #get_bias_mode() == bias_mode
+                - #get_learning_rate_multiplier() == 1
+        !*/
+
+        double get_learning_rate_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier that will be applied to the gradient of this layer during
+                  training. This value appears as a multiplicative factor in the update rule. So
+                  if get_learning_rate_multiplier() == 1 then the learning rate will be multiplied
+                  by 1 and thus not modified. However, if get_learning_rate_multiplier() == 0.1 then
+                  the learning rate will be multiplied by 0.1, making the layer update 10 times
+                  slower than it would otherwise be.
+        !*/
+
+        void set_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            ensures
+                - #get_learning_rate_multiplier() == val
+        !*/
+
+        unsigned long get_num_inputs(
+        ) const;
+        /*!
+            ensures
+                - Returns the number of input features this layer expects.
+                - For an uninitialized layer (i.e., one that has not seen any data during setup
+                  or forward pass), this will be zero.
+        !*/
+
+        unsigned long get_num_outputs(
+        ) const;
+        /*!
+            ensures
+                - Returns the number of output features this layer produces.
+                  I.e., this value is num_outputs.
+        !*/
+
+        void set_num_outputs(
+            long num
+        );
+        /*!
+            requires
+                - num > 0
+            ensures
+                - #get_num_outputs() == num
+            throws
+                - a DLIB_CASSERT failure if this function is called after the layer
+                  parameters have been allocated and num differs from the current
+                  number of outputs.
+        !*/
+
+        linear_bias_mode get_bias_mode(
+        ) const;
+        /*!
+            ensures
+                - Returns a value indicating whether this layer has a bias term.
+                  I.e. returns bias_mode.
+        !*/
+
+        template <typename SUBNET>
+        void setup(
+            const SUBNET& sub
+        );
+        /*!
+            ensures
+                - Performs the necessary setup work to process data through this layer.
+                - Sets the input size based on the dimensions of the input tensor from sub.
+                - Allocates the parameter tensor and initializes its values.
+                - #get_num_inputs() == the number of columns in sub.get_output() (i.e., nc).
+        !*/
+
+        template <typename SUBNET>
+        void forward(
+            const SUBNET& sub,
+            resizable_tensor& output
+        );
+        /*!
+            requires
+                - setup() has been called
+                - sub.get_output().nc() == get_num_inputs()
+            ensures
+                - Applies the linear transformation to the input tensor from sub and stores
+                  the results in output.
+                - #output.num_samples() == sub.get_output().num_samples()
+                - #output.k()           == sub.get_output().k()
+                - #output.nr()          == sub.get_output().nr()
+                - #output.nc()          == get_num_outputs()
+        !*/
+
+        template <typename SUBNET>
+        void backward(
+            const tensor& gradient_input,
+            SUBNET& sub,
+            tensor& params_grad
+        );
+        /*!
+            requires
+                - setup() has been called
+                - sub.get_output().nc() == get_num_inputs()
+                - gradient_input has the same dimensions as the output of forward()
+            ensures
+                - Computes the gradients of this layer with respect to the parameters
+                  and the input tensor, and updates the corresponding gradient tensors.
+                - Updates params_grad based on the gradients of the weights
+                  and biases (if present).
+                - Updates sub's gradient_input based on the gradients of the
+                  inputs to this layer.
+        !*/
+
+        alias_tensor_instance get_weights(
+        );
+        /*!
+            requires
+                - setup() has been called
+            ensures
+                - Returns a reference to the weights matrix of this layer.
+        !*/
+
+        alias_tensor_const_instance get_weights(
+        ) const;
+        /*!
+            requires
+                - setup() has been called
+            ensures
+                - Returns a const reference to the weights matrix of this layer.
+        !*/
+
+        alias_tensor_instance get_biases(
+        );
+        /*!
+            requires
+                - bias_mode == LINEAR_HAS_BIAS
+                - setup() has been called
+            ensures
+                - Returns a reference to the bias vector of this layer.
+            compile-time check
+                - a static_assert rejects this call when bias_mode != LINEAR_HAS_BIAS
+        !*/
+
+        alias_tensor_const_instance get_biases(
+        ) const;
+        /*!
+            requires
+                - bias_mode == LINEAR_HAS_BIAS
+                - setup() has been called
+            ensures
+                - Returns a const reference to the bias vector of this layer.
+            compile-time check
+                - a static_assert rejects this call when bias_mode != LINEAR_HAS_BIAS
+        !*/
+
+        dpoint map_input_to_output(
+            const dpoint& p
+        ) const;
+        /*!
+            ensures
+                - Returns p, since the linear layer maintains the same spatial dimensions.
+        !*/
+
+        dpoint map_output_to_input(
+            const dpoint& p
+        ) const;
+        /*!
+            ensures
+                - Returns p, since the linear layer maintains the same spatial dimensions.
+        !*/
+
+        const tensor& get_layer_params(
+        ) const;
+        /*!
+            ensures
+                - Returns the parameters that define this layer, i.e., the weights and biases
+                  (if present) that are updated during training.
+        !*/
+
+        tensor& get_layer_params(
+        );
+        /*!
+            ensures
+                - Returns the parameters that define this layer, i.e., the weights and biases
+                  (if present) that are updated during training.
+        !*/
+
+        friend void serialize(const linear_& item, std::ostream& out);
+        friend void deserialize(linear_& item, std::istream& in);
+        /*!
+            provides serialization support
+        !*/
+    };
+
+    template <
+        unsigned long num_outputs,
+        typename SUBNET
+    >
+    using linear = add_layer<linear_<num_outputs, LINEAR_HAS_BIAS>, SUBNET>;
+    /*!
+        This is a layer that applies a linear transformation with bias to the input:
+        output = input * weights + bias
+    !*/
+
+    template <
+        unsigned long num_outputs,
+        typename SUBNET
+    >
+    using linear_no_bias = add_layer<linear_<num_outputs, LINEAR_NO_BIAS>, SUBNET>;
+    /*!
+        This is a layer that applies a linear transformation without bias to the input:
+        output = input * weights
+    !*/
+
+    // ----------------------------------------------------------------------------------------
+
 // ----------------------------------------------------------------------------------------
 
     struct num_con_outputs
diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index 9316a0edc6..fae2f54d49 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -2419,6 +2419,24 @@ void test_embeddings()
             auto res = test_layer(l);
             DLIB_TEST_MSG(res, res);
         }
+        {
+            print_spinner();
+            linear_<1, LINEAR_NO_BIAS> l;
+            auto res = test_layer(l);
+            DLIB_TEST_MSG(res, res);
+        }
+        {
+            print_spinner();
+            linear_<5, LINEAR_HAS_BIAS> l;  // gradient-check the bias path too, not only LINEAR_NO_BIAS
+            auto res = test_layer(l);
+            DLIB_TEST_MSG(res, res);
+        }
+        {
+            print_spinner();
+            linear_<4, LINEAR_NO_BIAS> l;
+            auto res = test_layer(l);
+            DLIB_TEST_MSG(res, res);
+        }
         {
             print_spinner();
             relu_ l;
@@ -3527,6 +3545,62 @@ void test_multm_prev()
         DLIB_TEST_MSG(error_after < 1e-6, "Autoencoder error after training = " << error_after);
     }
 
+// ----------------------------------------------------------------------------------------
+    void test_linear()
+    {
+        // Verifies that linear_no_bias produces input * weights for every row of every sample.
+        print_spinner();
+
+        // Network under test: a 6-output, bias-free linear layer wrapped in tags.
+        using net_type = tag2<linear_no_bias<6, tag1<input<matrix<float>>>>>;
+        net_type net;
+
+        // Input tensor
+        const int n_samples = 3;
+        std::vector<matrix<float>> x(n_samples);
+        matrix<float> xtmp(2, 4);
+        xtmp = 1.0f, 2.0f, 3.0f, 4.0f,
+            5.0f, 6.0f, 7.0f, 8.0f;
+        x[0] = xtmp;
+        xtmp = 9.0f, 10.0f, 11.0f, 12.0f,
+            13.0f, 14.0f, 15.0f, 16.0f;
+        x[1] = xtmp;
+        xtmp = 17.0f, 18.0f, 19.0f, 20.0f,
+            21.0f, 22.0f, 23.0f, 24.0f;
+        x[2] = xtmp;
+
+        // Convert input matrix to tensor
+        resizable_tensor input_tensor;
+        net.to_tensor(&x[0], &x[0] + n_samples, input_tensor);
+        net.forward(input_tensor);
+
+        // Get the internal linear weights
+        matrix<float> w = mat(layer<tag2>(net).subnet().layer_details().get_weights());
+
+        // Theoretical calculation of the output
+        std::vector<matrix<float>> expected_outputs(n_samples);
+        for (int i = 0; i < n_samples; ++i) {
+            matrix<float> input_matrix = x[i];
+            expected_outputs[i] = input_matrix * w;
+        }
+
+        // Output tensor produced by the forward pass
+        auto& net_output = layer<tag2>(net).get_output();
+
+        // Copy each sample out of the tensor and compare it with the expected product
+        for (int i = 0; i < n_samples; ++i) {
+            matrix<float> output_sample;
+            output_sample.set_size(2, 6);
+            for (long r = 0; r < output_sample.nr(); ++r) {
+                for (long c = 0; c < output_sample.nc(); ++c) {
+                    output_sample(r, c) = net_output.host()[tensor_index(net_output, i, 0, r, c)];
+                }
+            }
+            DLIB_TEST_MSG(max(abs(output_sample - expected_outputs[i])) < 1e-5,
+                "linear layer - sample " + std::to_string(i));
+        }
+    }
+
 // ----------------------------------------------------------------------------------------
 
     void test_loss_mean_squared_per_channel_and_pixel()
@@ -5107,6 +5181,7 @@ void test_multm_prev()
             test_simple_linear_regression_with_mult_prev();
             test_multioutput_linear_regression();
             test_simple_autoencoder();
+            test_linear();
             test_loss_mean_squared_per_channel_and_pixel();
             test_loss_binary_log_per_pixel_learned_params_on_trivial_two_pixel_task();
             test_loss_binary_log_per_pixel_outputs_on_trivial_task();

From 93ead3d113535b150c97b0e2c23b293f502ac2a7 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Fri, 2 May 2025 22:12:40 +0200
Subject: [PATCH 02/21] Minor change

---
 dlib/dnn/layers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index 445864b4de..a9b5da0ca0 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2203,8 +2203,8 @@ namespace dlib
         void forward(const SUBNET& sub, resizable_tensor& output)
         {
             const auto& prev_output = sub.get_output();
-            DLIB_CASSERT((long)num_inputs == sub.get_output().nc(),
-                "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with.");
+            DLIB_CASSERT((long)num_inputs == prev_output.nc(),
+                "The size of the input tensor to this linear layer doesn't match the size the linear layer was trained with.");            
             output.set_size(prev_output.num_samples(), prev_output.k(), prev_output.nr(), num_outputs);
 
             auto o = alias_tensor(output.num_samples() * output.k() * output.nr(), num_outputs)(output, 0);

From bf1b805ad53208762dfd8f87569b3caba7d22360 Mon Sep 17 00:00:00 2001
From: "Davis E. King" <davis685@gmail.com>
Date: Sat, 3 May 2025 10:35:10 -0400
Subject: [PATCH 03/21] Update dlib/dnn/layers.h

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 dlib/dnn/layers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index a9b5da0ca0..123d49f523 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2170,7 +2170,7 @@ namespace dlib
         unsigned long get_num_outputs() const { return num_outputs; }
         void set_num_outputs(long num)
         {
-            DLIB_CASSERT(num > 0);
+            DLIB_CASSERT(num > 0, "The number of outputs must be > 0, but num == " << num);
             if (num != (long)num_outputs)
             {
                 DLIB_CASSERT(get_layer_params().size() == 0,

From f234faaaa0337806bd4982f40eb9bc896380d41c Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Tue, 6 May 2025 17:40:06 +0200
Subject: [PATCH 04/21] Add reshape_to and flatten layers to Dlib's DNN module

---
 dlib/dnn/layers.h          | 180 +++++++++++++++++++++++++++++++++++++
 dlib/dnn/layers_abstract.h | 170 +++++++++++++++++++++++++++++++++++
 dlib/test/dnn.cpp          |  48 +++++++++-
 3 files changed, 397 insertions(+), 1 deletion(-)

diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index 123d49f523..7ec8b1a956 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -975,6 +975,186 @@ namespace dlib
         >
     using resize_to = add_layer<resize_to_<NR,NC>, SUBNET>;
     
+// ----------------------------------------------------------------------------------------
+
+    template <long k_ = -1, long nr_ = -1, long nc_ = -1>
+    class reshape_to_
+    {
+    public:
+        explicit reshape_to_() :
+            output_k(k_),
+            output_nr(nr_),
+            output_nc(nc_)
+        {
+            static_assert(k_ == -1 || k_ > 0, "Output k must be positive or -1");
+            static_assert(nr_ == -1 || nr_ > 0, "Output nr must be positive or -1");
+            static_assert(nc_ == -1 || nc_ > 0, "Output nc must be positive or -1");
+
+            input_k = input_nr = input_nc = 0;
+            needs_rescale = false;
+        }
+
+        // Getters for dimensions
+        long get_output_k() const { return output_k; }
+        long get_output_nr() const { return output_nr; }
+        long get_output_nc() const { return output_nc; }
+
+        // Setters for dimensions
+        void set_output_k(long k) {
+            DLIB_CASSERT(k == -1 || k > 0, "Output k must be positive or -1 to keep original dimension");
+            output_k = k;
+        }
+        void set_output_nr(long nr) {
+            DLIB_CASSERT(nr == -1 || nr > 0, "output nr must be positive or -1 to keep original dimension");
+            output_nr = nr;
+        }
+        void set_output_nc(long nc) {
+            DLIB_CASSERT(nc == -1 || nc > 0, "output nc must be positive or -1 to keep original dimension");
+            output_nc = nc;
+        }
+
+        // Caches the input shape, resolves -1 output dimensions and picks reshape vs. rescale mode.
+        template <typename SUBNET>
+        void setup(const SUBNET& sub)
+        {
+            const auto& input = sub.get_output();
+            input_k = input.k();
+            input_nr = input.nr();
+            input_nc = input.nc();
+            // A template argument of -1 means "take this dimension from the input".
+            if (k_ == -1) output_k = input_k;
+            if (nr_ == -1) output_nr = input_nr;
+            if (nc_ == -1) output_nc = input_nc;
+            // Equal element counts mean a pure reshape. The flag is assigned (not just set) so a
+            // value left over from an earlier setup() call cannot force the rescale path.
+            const long input_elements = input_k * input_nr * input_nc;
+            const long output_elements = output_k * output_nr * output_nc;
+            needs_rescale = (input_elements != output_elements && input_k == output_k);
+            DLIB_CASSERT(input_elements == output_elements || needs_rescale,
+                "Cannot reshape tensor of " << input_elements <<
+                " elements into shape with " << output_elements << " elements. " <<
+                "For spatial rescaling, the channel dimension (k) must remain constant.");
+        }
+
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output)
+        {
+            // Set the output size (always preserving batch dimension)
+            const tensor& input = sub.get_output();
+            output.set_size(input.num_samples(), output_k, output_nr, output_nc);
+
+            if (!needs_rescale)
+            {
+                // Create an alias of the input tensor with the output shape
+                alias_tensor input_alias(output.num_samples(), output_k, output_nr, output_nc);
+                // Get a view of the input tensor with the new shape
+                auto input_reshaped = input_alias(const_cast<tensor&>(input), 0);
+                // Copy the view to the output tensor
+                tt::copy_tensor(false, output, 0, input_reshaped, 0, input_reshaped.k());
+            }
+            else
+            {
+                // Only spatial dimensions need to be resized
+                tt::resize_bilinear(output, input);
+            }
+        }
+
+        template <typename SUBNET>
+        void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+        {
+            auto& grad = sub.get_gradient_input();
+
+            if (!needs_rescale) {
+                // Create an alias of the gradient tensor with the original input shape
+                alias_tensor grad_alias(grad.num_samples(), grad.k(), grad.nr(), grad.nc());
+                // Get a view of the input gradient with the required shape
+                auto grad_reshaped = grad_alias(const_cast<tensor&>(gradient_input), 0);
+                // Copy the view to the output gradient
+                tt::copy_tensor(true, grad, 0, grad_reshaped, 0, grad_reshaped.k());
+            }
+            else
+            {
+                // Only spatial dimensions were resized
+                tt::resize_bilinear_gradient(grad, gradient_input);
+            }
+        }
+
+        // Mapping functions for coordinate transformations
+        inline dpoint map_input_to_output(const dpoint& p) const {
+            double scale_x = output_nc / static_cast<double>(input_nc);
+            double scale_y = output_nr / static_cast<double>(input_nr);
+            return dpoint(p.x() * scale_x, p.y() * scale_y);
+        }
+        inline dpoint map_output_to_input(const dpoint& p) const {
+            double scale_x = input_nc / static_cast<double>(output_nc);
+            double scale_y = input_nr / static_cast<double>(output_nr);
+            return dpoint(p.x() * scale_x, p.y() * scale_y);
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const reshape_to_& item, std::ostream& out)
+        {
+            serialize("reshape_to_", out);
+            serialize(item.input_k, out);
+            serialize(item.input_nr, out);
+            serialize(item.input_nc, out);
+            serialize(item.output_k, out);
+            serialize(item.output_nr, out);
+            serialize(item.output_nc, out);
+            serialize(item.needs_rescale, out);
+        }
+
+        friend void deserialize(reshape_to_& item, std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version != "reshape_to_")
+                throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::reshape_to_.");
+            deserialize(item.input_k, in);
+            deserialize(item.input_nr, in);
+            deserialize(item.input_nc, in);
+            deserialize(item.output_k, in);
+            deserialize(item.output_nr, in);
+            deserialize(item.output_nc, in);
+            deserialize(item.needs_rescale, in);
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const reshape_to_& item)
+        {
+            out << "reshape_to (";
+            out << "k=" << std::to_string(item.output_k);
+            out << ", nr=" << std::to_string(item.output_nr);
+            out << ", nc=" << std::to_string(item.output_nc);
+            out << ", mode=" << (item.needs_rescale ? "spatial_rescale" : "pure_reshape");
+            out << ")";
+            return out;
+        }
+
+        friend void to_xml(const reshape_to_& item, std::ostream& out)
+        {
+            out << "<reshape_to"
+                << " k='" << item.output_k << "'"
+                << " nr='" << item.output_nr << "'"
+                << " nc='" << item.output_nc << "'"
+                << " mode='" << (item.needs_rescale ? "spatial_rescale" : "pure_reshape") << "'"
+                << "/>\n";
+        }
+
+    private:        
+        long input_k, input_nr, input_nc;       // Input dimensions        
+		long output_k, output_nr, output_nc;    // Output dimensions        
+        bool needs_rescale;        
+        resizable_tensor params;                // No trainable parameters
+    };
+
+    template <long k, long nr, long nc, typename SUBNET>
+    using reshape_to = add_layer<reshape_to_<k, nr, nc>, SUBNET>;
+
+    template <long k, long nr, long nc, typename SUBNET>
+    using flatten = add_layer<reshape_to_<k * nr * nc, 1, 1>, SUBNET>;
+
 // ----------------------------------------------------------------------------------------
 
     template <
diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h
index e5f2d340e0..f0512d7d4c 100644
--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
@@ -1642,6 +1642,176 @@ namespace dlib
         >
     using resize_to = add_layer<resize_to_<NR,NC>, SUBNET>;
     
+// ----------------------------------------------------------------------------------------
+
+    template <long k_ = -1, long nr_ = -1, long nc_ = -1>
+    class reshape_to_
+    {
+        /*!
+            REQUIREMENTS ON TEMPLATE ARGUMENTS
+                - k_, nr_, and nc_ must be either -1 or greater than 0.
+
+            WHAT THIS OBJECT REPRESENTS
+                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+                defined above. It defines a layer that reshapes or resizes an input tensor
+                into a different shape. The layer operates in two modes:
+
+                1. Pure Reshape Mode: When the total number of elements in the input tensor
+                   equals the total number of elements in the output tensor, this layer
+                   performs a simple reshaping operation without changing the values.
+
+                2. Spatial Rescaling Mode: When the channel dimension (k) remains constant
+                   but the total number of elements changes, this layer performs bilinear
+                   interpolation to resize the spatial dimensions while preserving the
+                   channel information.
+
+                The dimensions of the output tensor are determined by the template parameters:
+                    - If k_ is -1, the output tensor will have the same number of channels as the input.
+                    - If nr_ is -1, the output tensor will have the same number of rows as the input.
+                    - If nc_ is -1, the output tensor will have the same number of columns as the input.
+
+                Setting a value of -1 for any dimension means "keep the original dimension from the input."
+
+                Note that this layer will throw an exception if you attempt to change both the
+                channel count (k) and the total number of elements. Either:
+                - Keep the total number of elements the same (Pure Reshape Mode), or
+                - Keep the channel count the same and only change spatial dimensions (Spatial Rescaling Mode)
+        !*/
+
+    public:
+        explicit reshape_to_();
+        /*!
+            ensures
+                - #get_output_k() == k_
+                - #get_output_nr() == nr_
+                - #get_output_nc() == nc_
+        !*/
+
+        long get_output_k() const;
+        /*!
+            ensures
+                - Returns the number of channels in the output tensor. If this value is -1,
+                  then the output will have the same number of channels as the input.
+        !*/
+
+        long get_output_nr() const;
+        /*!
+            ensures
+                - Returns the number of rows in the output tensor. If this value is -1,
+                  then the output will have the same number of rows as the input.
+        !*/
+
+        long get_output_nc() const;
+        /*!
+            ensures
+                - Returns the number of columns in the output tensor. If this value is -1,
+                  then the output will have the same number of columns as the input.
+        !*/
+
+        void set_output_k(long k);
+        /*!
+            requires
+                - k == -1 || k > 0
+            ensures
+                - #get_output_k() == k
+        !*/
+
+        void set_output_nr(long nr);
+        /*!
+            requires
+                - nr == -1 || nr > 0
+            ensures
+                - #get_output_nr() == nr
+        !*/
+
+        void set_output_nc(long nc);
+        /*!
+            requires
+                - nc == -1 || nc > 0
+            ensures
+                - #get_output_nc() == nc
+        !*/
+
+        template <typename SUBNET> void setup(const SUBNET& sub);
+        /*!
+            requires
+                - SUBNET implements the SUBNET interface defined at the top of this file.
+            ensures
+                - Configures this layer to operate on the output of sub.
+                - If the total number of elements in the input tensor doesn't match the total
+                  number of elements in the output tensor and the channel dimension is different,
+                  an exception will be thrown.
+        !*/
+
+        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+        /*!
+            requires
+                - SUBNET implements the SUBNET interface defined at the top of this file.
+                - setup() has been called.
+            ensures
+                - Reshapes or resizes the output of sub and stores it in #output.
+                - If is_spatial_rescale() == false, then performs a pure reshape operation.
+                - If is_spatial_rescale() == true, then performs bilinear interpolation to resize
+                  the spatial dimensions while preserving the channel information.
+                - #output.num_samples() == sub.get_output().num_samples()
+                - #output.k() == get_output_k() if get_output_k() != -1, otherwise sub.get_output().k()
+                - #output.nr() == get_output_nr() if get_output_nr() != -1, otherwise sub.get_output().nr()
+                - #output.nc() == get_output_nc() if get_output_nc() != -1, otherwise sub.get_output().nc()
+        !*/
+
+        template <typename SUBNET> void backward(
+            const tensor& gradient_input,
+            SUBNET& sub,
+            tensor& params_grad
+        );
+        /*!
+            requires
+                - SUBNET implements the SUBNET interface defined at the top of this file.
+                - setup() has been called.
+                - gradient_input has the same dimensions as the output of forward().
+            ensures
+                - Computes the gradients of this layer with respect to the input tensor and
+                  parameters, and stores them in sub.get_gradient_input() and params_grad,
+                  respectively.
+                - This function supports both pure reshaping and spatial rescaling operations.
+        !*/
+
+        dpoint map_input_to_output(dpoint p) const;
+        /*!
+            ensures
+                - Maps a point in the input tensor's coordinate system to the corresponding point
+                  in the output tensor. This is useful for tracking how spatial locations change
+                  through the network, especially during spatial rescaling.
+        !*/
+
+        dpoint map_output_to_input(dpoint p) const;
+        /*!
+            ensures
+                - Maps a point in the output tensor's coordinate system to the corresponding point
+                  in the input tensor. This is the inverse of map_input_to_output().
+        !*/
+
+        const tensor& get_layer_params() const;
+        /*!
+            ensures
+                - Returns the layer's parameters. This layer has no parameters,
+                  so this always returns an empty tensor.
+        !*/
+
+        tensor& get_layer_params();
+        /*!
+            ensures
+                - Returns the layer's parameters. This layer has no parameters,
+                  so this always returns an empty tensor.
+        !*/
+    };
+
+    template <long k, long nr, long nc, typename SUBNET>
+    using reshape_to = add_layer<reshape_to_<k, nr, nc>, SUBNET>;
+
+    template <long k, long nr, long nc, typename SUBNET>  // flatten: output is (k*nr*nc) x 1 x 1
+    using flatten = add_layer<reshape_to_<k * nr * nc, 1, 1>, SUBNET>;
+
 // ----------------------------------------------------------------------------------------
 
     class dropout_
diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index fae2f54d49..fc18fe95b6 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -2538,7 +2538,19 @@ void test_embeddings()
             embeddings_<7, 12> l;
             auto res = test_layer(l);
             DLIB_TEST_MSG(res, res);
-        }        
+        }
+        {
+            print_spinner();
+            reshape_to_<-1, -1, -1> l;
+            auto res = test_layer(l);
+            DLIB_TEST_MSG(res, res);
+        }
+        {
+            print_spinner();
+            reshape_to_<-1, 3, 5> l;
+            auto res = test_layer(l);
+            DLIB_TEST_MSG(res, res);
+        }
     }
 
 // ----------------------------------------------------------------------------------------
@@ -4801,6 +4813,39 @@ void test_multm_prev()
         }
     }
 
+    void test_resize_to() {
+        print_spinner();
+        const long nr = 8, nc = 12;
+        const long n_samples = 5, k = 1, h = 4;
+        // Despite its name this exercises reshape_to/flatten: -> (h,nr,nc/h) -> (h*nr*(nc/h),1,1) -> (k,nr,nc)
+        using net_type = tag1<reshape_to<k, nr, nc,
+            flatten<h, nr, nc / h, reshape_to<h, nr, nc / h,
+            input<matrix<float>>>>>>;
+        net_type net;
+        // Build n_samples input matrices of size nr x nc filled with Gaussian noise
+        dlib::rand rnd;
+        std::vector<matrix<float>> x(n_samples);
+        matrix<float> xtmp(nr, nc);
+        for (int ii = 0; ii < n_samples; ++ii) {
+            for (int jj = 0; jj < nr; ++jj)
+                for (int kk = 0; kk < nc; ++kk)
+                    xtmp(jj, kk) = rnd.get_random_gaussian();
+            x[ii] = xtmp;
+        }
+        // Convert the batch to a tensor and run one forward pass through the whole chain
+        resizable_tensor input_tensor;
+        net.to_tensor(&x[0], &x[0] + n_samples, input_tensor);
+        net.forward(input_tensor);
+
+        auto& output_tensor = layer<tag1>(net).get_output();
+        // Every stage keeps the element count (8*12 == 4*8*3 == 96), so shape and values must round-trip
+        DLIB_TEST(output_tensor.num_samples() == input_tensor.num_samples());
+        DLIB_TEST(output_tensor.k() == input_tensor.k());
+        DLIB_TEST(output_tensor.nr() == input_tensor.nr());
+        DLIB_TEST(output_tensor.nc() == input_tensor.nc());
+        DLIB_TEST(max(abs(mat(output_tensor) - mat(input_tensor))) < 1e-5);
+    }
+
 // ----------------------------------------------------------------------------------------
 
     template <long num_filters, long ks, int s, typename SUBNET>
@@ -5168,6 +5213,7 @@ void test_multm_prev()
             test_embeddings();
             test_tril();
             test_basic_tensor_ops();
+			test_resize_to();
             test_layers();
             test_visit_functions();
             test_copy_tensor_cpu();

From 26a29603e76ef62cfa953aff11b89951eaf017f4 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Thu, 22 May 2025 11:10:07 +0200
Subject: [PATCH 05/21] Missing update to "visitors.h"

---
 dlib/dnn/visitors.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/dlib/dnn/visitors.h b/dlib/dnn/visitors.h
index 589e3556ef..6f0920369b 100644
--- a/dlib/dnn/visitors.h
+++ b/dlib/dnn/visitors.h
@@ -797,6 +797,15 @@ namespace dlib
                 update(i);
             }
 
+            template <unsigned long no, linear_bias_mode bm, typename U, typename E>
+            void operator()(size_t i, const add_layer<linear_<no, bm>, U, E>& l)
+            {
+                start_node(i, "linear");
+                out << " | { outputs |{" << l.layer_details().get_num_outputs() << "}}";
+                end_node();
+                update(i);
+            }
+
             template <typename U, typename E>
             void operator()(size_t i, const add_layer<dropout_, U, E>&)
             {
@@ -1031,6 +1040,17 @@ namespace dlib
                 update(i);
             }
 
+            template <long k, long nr, long nc, typename U, typename E>
+            void operator()(size_t i, const add_layer<reshape_to_<k, nr, nc>, U, E>&)
+            {
+                start_node(i, "reshape_to");                
+                out << " | {k|{" << (k != -1 ? k : "unchanged") << "}}";
+                out << " | {nr|{" << (nr != -1 ? nr : "unchanged") << "}}";
+                out << " | {nc|{" << (nc != -1 ? nc : "unchanged") << "}}";
+                end_node();
+                update(i);
+            }
+
             template <typename U, typename E>
             void operator()(size_t i, const add_layer<transpose_, U, E>&)
             {

From c9a1ee4d096e0712f31d1e786f05c707d2db6819 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Thu, 22 May 2025 11:28:22 +0200
Subject: [PATCH 06/21] format fixing for reshape_to

---
 dlib/dnn/visitors.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/dlib/dnn/visitors.h b/dlib/dnn/visitors.h
index 6f0920369b..d9f7401974 100644
--- a/dlib/dnn/visitors.h
+++ b/dlib/dnn/visitors.h
@@ -1043,10 +1043,13 @@ namespace dlib
             template <long k, long nr, long nc, typename U, typename E>
             void operator()(size_t i, const add_layer<reshape_to_<k, nr, nc>, U, E>&)
             {
-                start_node(i, "reshape_to");                
-                out << " | {k|{" << (k != -1 ? k : "unchanged") << "}}";
-                out << " | {nr|{" << (nr != -1 ? nr : "unchanged") << "}}";
-                out << " | {nc|{" << (nc != -1 ? nc : "unchanged") << "}}";
+                start_node(i, "reshape_to");
+                if (k == -1) out << " | {k|{unchanged}}";
+                else out << " | {k|{" << k << "}}";
+                if (nr == -1) out << " | {nr|{unchanged}}";
+                else out << " | {nr|{" << nr << "}}";
+                if (nc == -1) out << " | {nc|{unchanged}}";
+                else out << " | {nc|{" << nc << "}}";
                 end_node();
                 update(i);
             }

From 02e62d8a36bcd4b51c6a39d87cb44d2e7cad00a5 Mon Sep 17 00:00:00 2001
From: "Davis E. King" <davis685@gmail.com>
Date: Fri, 23 May 2025 07:27:28 -0400
Subject: [PATCH 07/21] Update dlib/test/dnn.cpp

---
 dlib/test/dnn.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index fc18fe95b6..b62a341c73 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -5213,7 +5213,7 @@ void test_multm_prev()
             test_embeddings();
             test_tril();
             test_basic_tensor_ops();
-			test_resize_to();
+            test_resize_to();
             test_layers();
             test_visit_functions();
             test_copy_tensor_cpu();

From 778bfc1907be274b3b3677e2f63876efe5f8733e Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Thu, 29 May 2025 12:31:06 +0200
Subject: [PATCH 08/21] Vocabulary size fixed for learning, and function added
 for transformation-free tokenization

---
 dlib/tokenizer/bpe_tokenizer.h | 130 ++++++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 42 deletions(-)

diff --git a/dlib/tokenizer/bpe_tokenizer.h b/dlib/tokenizer/bpe_tokenizer.h
index f9457b554f..f1ae88cf43 100644
--- a/dlib/tokenizer/bpe_tokenizer.h
+++ b/dlib/tokenizer/bpe_tokenizer.h
@@ -20,49 +20,47 @@
 
 namespace dlib
 {
-    constexpr size_t BPE_TOKENIZER_MAX_TOKEN_LENGTH = 8;
-    constexpr int BPE_TOKENIZER_BASE_VOCAB_SIZE = 256;
 
     class bpe_tokenizer
     {
     public:
-        bpe_tokenizer() : vocab_size(BPE_TOKENIZER_BASE_VOCAB_SIZE)
+        bpe_tokenizer() : vocab_size(BASE_VOCAB_SIZE)
         {
             // Initialize the base vocabulary with single bytes
-            for (int i = 0; i < BPE_TOKENIZER_BASE_VOCAB_SIZE; ++i)
+            for (int i = 0; i < BASE_VOCAB_SIZE; ++i)
                 vocab[i] = std::vector<uint8_t>{ static_cast<uint8_t>(i) };
             
             // Initialize special tokens with sequential IDs
             special_tokens =
             {
-                {"<text>",      BPE_TOKENIZER_BASE_VOCAB_SIZE},
-                {"</text>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 1},
-                {"<url>",       BPE_TOKENIZER_BASE_VOCAB_SIZE + 2},
-                {"</url>",      BPE_TOKENIZER_BASE_VOCAB_SIZE + 3},
-                {"<image>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 4},
-                {"</image>",    BPE_TOKENIZER_BASE_VOCAB_SIZE + 5},
-                {"<video>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 6},
-                {"</video>",    BPE_TOKENIZER_BASE_VOCAB_SIZE + 7},
-                {"<audio>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 8},
-                {"</audio>",    BPE_TOKENIZER_BASE_VOCAB_SIZE + 9},
-                {"<file>",      BPE_TOKENIZER_BASE_VOCAB_SIZE + 10},
-                {"</file>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 11},
-                {"<code>",      BPE_TOKENIZER_BASE_VOCAB_SIZE + 12},
-                {"</code>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 13},
-                {"<summary>",   BPE_TOKENIZER_BASE_VOCAB_SIZE + 14},
-                {"</summary>",  BPE_TOKENIZER_BASE_VOCAB_SIZE + 15},
-                {"<think>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 16},
-                {"</think>",    BPE_TOKENIZER_BASE_VOCAB_SIZE + 17},
-                {"<start>",     BPE_TOKENIZER_BASE_VOCAB_SIZE + 18},
-                {"<end>",       BPE_TOKENIZER_BASE_VOCAB_SIZE + 19},
-                {"<user>",      BPE_TOKENIZER_BASE_VOCAB_SIZE + 20},
-                {"<bot>",       BPE_TOKENIZER_BASE_VOCAB_SIZE + 21},
-                {"<system>",    BPE_TOKENIZER_BASE_VOCAB_SIZE + 22},
-                {"<question>",  BPE_TOKENIZER_BASE_VOCAB_SIZE + 23},
-                {"<answer>",    BPE_TOKENIZER_BASE_VOCAB_SIZE + 24},
-                {"<search>",    BPE_TOKENIZER_BASE_VOCAB_SIZE + 25},
-                {"<unk>",       BPE_TOKENIZER_BASE_VOCAB_SIZE + 26},
-                {"<pad>",       BPE_TOKENIZER_BASE_VOCAB_SIZE + 27}
+                {"<text>",      BASE_VOCAB_SIZE},
+                {"</text>",     BASE_VOCAB_SIZE + 1},
+                {"<url>",       BASE_VOCAB_SIZE + 2},
+                {"</url>",      BASE_VOCAB_SIZE + 3},
+                {"<image>",     BASE_VOCAB_SIZE + 4},
+                {"</image>",    BASE_VOCAB_SIZE + 5},
+                {"<video>",     BASE_VOCAB_SIZE + 6},
+                {"</video>",    BASE_VOCAB_SIZE + 7},
+                {"<audio>",     BASE_VOCAB_SIZE + 8},
+                {"</audio>",    BASE_VOCAB_SIZE + 9},
+                {"<file>",      BASE_VOCAB_SIZE + 10},
+                {"</file>",     BASE_VOCAB_SIZE + 11},
+                {"<code>",      BASE_VOCAB_SIZE + 12},
+                {"</code>",     BASE_VOCAB_SIZE + 13},
+                {"<summary>",   BASE_VOCAB_SIZE + 14},
+                {"</summary>",  BASE_VOCAB_SIZE + 15},
+                {"<think>",     BASE_VOCAB_SIZE + 16},
+                {"</think>",    BASE_VOCAB_SIZE + 17},
+                {"<start>",     BASE_VOCAB_SIZE + 18},
+                {"<end>",       BASE_VOCAB_SIZE + 19},
+                {"<user>",      BASE_VOCAB_SIZE + 20},
+                {"<bot>",       BASE_VOCAB_SIZE + 21},
+                {"<system>",    BASE_VOCAB_SIZE + 22},
+                {"<question>",  BASE_VOCAB_SIZE + 23},
+                {"<answer>",    BASE_VOCAB_SIZE + 24},
+                {"<search>",    BASE_VOCAB_SIZE + 25},
+                {"<unk>",       BASE_VOCAB_SIZE + 26},
+                {"<pad>",       BASE_VOCAB_SIZE + 27}
             };
 
             // Initialize the vector of special token IDs
@@ -73,9 +71,9 @@ namespace dlib
         // Train the tokenizer on the given text
         void train(const std::string& text, int vocab_size, bool verbose = false)
         {
-            DLIB_CASSERT(vocab_size >= BPE_TOKENIZER_BASE_VOCAB_SIZE);
+            DLIB_CASSERT(vocab_size >= (BASE_VOCAB_SIZE + special_tokens.size()));
             this->vocab_size = vocab_size;
-            int num_merges = vocab_size - BPE_TOKENIZER_BASE_VOCAB_SIZE;
+            int num_merges = vocab_size - (BASE_VOCAB_SIZE + special_tokens.size());
 
             // Convert text to byte IDs
             std::vector<int> ids;
@@ -86,25 +84,25 @@ namespace dlib
                 auto stats = get_stats(ids);
                 if (stats.empty()) break;
 
-                // Find the most frequent pair that does not exceed BPE_TOKENIZER_MAX_TOKEN_LENGTH
+                // Find the most frequent pair that does not exceed MAX_TOKEN_LENGTH
                 auto pair = get_most_frequent_pair(stats);
 
-                // Check if the resulting token would exceed BPE_TOKENIZER_MAX_TOKEN_LENGTH
+                // Check if the resulting token would exceed MAX_TOKEN_LENGTH
                 size_t new_token_length = vocab[pair.first].size() + vocab[pair.second].size();
-                if (new_token_length > BPE_TOKENIZER_MAX_TOKEN_LENGTH) {
+                if (new_token_length > MAX_TOKEN_LENGTH) {
                     if (verbose)
                     {
                         std::cout << "\r"
                             << std::setw(100) << std::flush
                             << "\rskipping merge " << std::to_string(i + 1) << "/" << std::to_string(num_merges) << ": ("
                             << std::to_string(pair.first) << "," << std::to_string(pair.second) << ") -> new token length "
-                            << std::to_string(new_token_length) << " exceeds limit of " << std::to_string(BPE_TOKENIZER_MAX_TOKEN_LENGTH)
+                            << std::to_string(new_token_length) << " exceeds limit of " << std::to_string(MAX_TOKEN_LENGTH)
                             << std::flush;
                     }
                     continue; // Skip this merge
                 }
 
-                int idx = (BPE_TOKENIZER_BASE_VOCAB_SIZE + (int)special_tokens.size()) + i;
+                int idx = (BASE_VOCAB_SIZE + (int)special_tokens.size()) + i;
                 ids = merge(ids, pair, idx);
                 merges[pair] = idx;
                 vocab[idx].insert(vocab[idx].end(), vocab[pair.first].begin(), vocab[pair.first].end());
@@ -123,7 +121,52 @@ namespace dlib
             }
         }
 
-        // Encode the given text into subword tokens
+        // Encode the given text into subword tokens without paragraph splitting or special token wrapping
+        std::vector<int> encode_raw(const std::string& text) const
+        {
+            // Start from one id per input byte, then repeatedly apply the learned merges
+            std::vector<int> ids;
+            ids.reserve(text.size());
+
+            // Cast through uint8_t so bytes >= 0x80 become 128..255 instead of negative ids
+            for (char c : text) ids.push_back(static_cast<uint8_t>(c));
+
+            // Queue every adjacent pair that has a learned merge, keyed by its merge id
+            auto stats = get_stats(ids);
+            std::priority_queue<std::pair<int, std::pair<int, int>>> pq;
+            for (const auto& stat : stats) {
+                const std::pair<int, int>& pair = stat.first;
+                if (merges.count(pair)) pq.push({ merges.at(pair), pair });
+            }
+            // std::priority_queue is a max-heap, so the candidate with the largest merge id pops first
+            while (!pq.empty()) {
+                // Copy the pair by value: a reference into pq.top() would dangle once pop() runs
+                const std::pair<int, int> pair = pq.top().second;
+                pq.pop();
+                // The queue can hold stale or duplicate entries; skip pairs not currently adjacent in ids
+                bool pair_found = false;
+                for (size_t i = 0; i + 1 < ids.size(); ++i) {
+                    if (ids[i] == pair.first && ids[i + 1] == pair.second) {
+                        pair_found = true;
+                        break;
+                    }
+                }
+                if (!pair_found) continue;
+
+                int idx = merges.at(pair);
+                ids = merge(ids, pair, idx);
+                // Rescan after the merge so that newly formed adjacent pairs are queued as well
+                stats = get_stats(ids);
+                for (const auto& stat : stats) {
+                    const std::pair<int, int>& new_pair = stat.first;
+                    if (merges.count(new_pair)) pq.push({ merges.at(new_pair), new_pair });
+                }
+            }
+
+            return ids;
+        }
+
+        // Encode the given text into subword tokens (advanced version)
         std::vector<int> encode(const std::string& text) const
         {
             std::vector<int> result_ids;
@@ -289,6 +332,9 @@ namespace dlib
         std::map<int, std::vector<uint8_t>> vocab;
         int vocab_size;
 
+        static const size_t MAX_TOKEN_LENGTH = 8;
+        static const int BASE_VOCAB_SIZE = 256;
+
         // Get frequency statistics of adjacent token pairs
         struct pair_hash {
             template <class T1, class T2>
@@ -343,10 +389,10 @@ namespace dlib
 
                 // Check if the new token formed by merging the pair would exceed the maximum allowed length
                 size_t new_token_length = vocab.at(pair.first).size() + vocab.at(pair.second).size();
-                if (new_token_length > BPE_TOKENIZER_MAX_TOKEN_LENGTH) continue; // Skip this pair if it exceeds the maximum token length
+                if (new_token_length > MAX_TOKEN_LENGTH) continue; // Skip this pair if it exceeds the maximum token length
 
                 // Calculate the score for this pair (frequency * length_penalty)
-                double score = (size_t)count * (new_token_length > (BPE_TOKENIZER_MAX_TOKEN_LENGTH / 2) ? 1.75 : 1.0);
+                double score = (size_t)count * (new_token_length > (MAX_TOKEN_LENGTH / 2) ? 1.75 : 1.0);
 
                 // Update the best pair if the current pair has a higher score
                 if (score > max_score)

From 03aafc2cd92c231b35cd3ac0b5aa25139c5de2c2 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Thu, 29 May 2025 12:32:41 +0200
Subject: [PATCH 09/21] =?UTF-8?q?Added=20a=20new=20example=20for=20learnin?=
 =?UTF-8?q?g=20a=20=E2=80=9Ccomplex=E2=80=9D=20Transformer=20model.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/slm_advanced_train_ex.cpp | 1293 ++++++++++++++++++++++++++++
 1 file changed, 1293 insertions(+)
 create mode 100644 examples/slm_advanced_train_ex.cpp

diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp
new file mode 100644
index 0000000000..81b9badd72
--- /dev/null
+++ b/examples/slm_advanced_train_ex.cpp
@@ -0,0 +1,1293 @@
+﻿/*!
+    @file slm_advanced_train_ex.cpp
+    @brief Transformer-based text training/generation
+
+    This program implements a complete training and generation pipeline for a
+    Transformer-based text compression system.
+    The model features:
+
+    1. Rotary Positional Embeddings (RoPE) for enhanced positional encoding
+    2. Multi-head self-attention with efficient memory handling
+    3. Mixture-of-Experts architecture for specialized processing
+    4. BPE tokenization with custom vocabulary
+    5. Full training/generation/verification workflow
+
+    Key capabilities demonstrated:
+    - Perfect memorization and reproduction of training text
+    - Efficient autoregressive generation
+    - Byte-level verification of reconstructed text
+
+    References:
+    [1] Vaswani et al., "Attention Is All You Need" (Transformer architecture)
+        arXiv:1706.03762
+    [2] Su et al., "RoFormer: Enhanced Transformer with Rotary Position Embedding"
+        arXiv:2104.09864
+    [3] Shazeer et al., "Outrageously Large Neural Networks: The Sparsely-Gated
+        Mixture-of-Experts Layer" (MoE architecture) arXiv:1701.06538
+
+    Usage modes:
+    --train         Train model on enwiki dataset
+    --generate      Generate text from trained model
+    --verify        Compare generated output with original
+    --tokenize-only Only perform tokenization step
+
+    Configuration:
+    - Adjust template parameters in transformer_config for model architecture
+    - Modify training parameters in main() for optimization
+    - Set sequence length and memory limits according to available hardware
+!*/
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <random>
+#include <fstream>
+#include <chrono>
+#include <algorithm>
+#include <dlib/data_io.h>
+#include <dlib/cmd_line_parser.h>
+#include <dlib/misc_api.h>
+#include <dlib/tokenizer/bpe_tokenizer.h>
+#include <dlib/dnn.h>
+
+using namespace std;
+using namespace dlib;
+
+namespace ernie
+{
+    class rotary_positional_embedding_ {
+    public:
+        explicit rotary_positional_embedding_() = default;
+
+        template <typename SUBNET>
+        void setup(const SUBNET& sub) {
+            // Precompute the rotation angles and their trigonometric values
+            seq_len = sub.get_output().nr();
+            d_head = sub.get_output().nc();
+            compute_rotation_angles();
+            precompute_trigonometric_values();
+        }
+
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output) {
+            const tensor& input = sub.get_output();
+            output.copy_size(input);
+            tt::copy_tensor(false, output, 0, input, 0, input.k());
+
+            // Apply rotary embedding to the output
+            apply_rotary_embedding(output);
+        }
+
+        template <typename SUBNET>
+        void backward(
+            const tensor& gradient_input,
+            SUBNET& sub,
+            tensor& params_grad
+        ) {
+            tensor& prev = sub.get_gradient_input();
+            resizable_tensor grad_output;
+            grad_output.copy_size(gradient_input);
+            tt::copy_tensor(false, grad_output, 0, gradient_input, 0, gradient_input.k());
+
+            // Apply the inverse rotation to the gradient (transpose of the rotation matrix)
+            apply_rotary_embedding(grad_output, true);
+            tt::copy_tensor(true, prev, 0, grad_output, 0, grad_output.k());
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const rotary_positional_embedding_& item, std::ostream& out) {
+            serialize("rotary_positional_embedding_", out);
+            serialize(item.seq_len, out);
+            serialize(item.d_head, out);
+            serialize(item.angles, out);
+            serialize(item.cos_values, out);
+            serialize(item.sin_values, out);
+        }
+
+        friend void deserialize(rotary_positional_embedding_& item, std::istream& in) {
+            std::string version;
+            deserialize(version, in);
+            if (version != "rotary_positional_embedding_")
+                throw serialization_error("Unexpected version found while deserializing rotary_positional_embedding_.");
+            deserialize(item.seq_len, in);
+            deserialize(item.d_head, in);
+            deserialize(item.angles, in);
+            deserialize(item.cos_values, in);
+            deserialize(item.sin_values, in);
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const rotary_positional_embedding_& item) {
+            out << "rotary_positional_embedding";
+            out << " (d_head=" << item.d_head << ", seq_len=" << item.seq_len << ")";
+            return out;
+        }
+
+        friend void to_xml(const rotary_positional_embedding_& item, std::ostream& out)
+        {
+            out << "<rotary_positional_embedding"
+                << " d_head='" << item.d_head << "'"
+                << " seq_len='" << item.seq_len << "'"
+                << "/>\n";
+        }
+
+    protected:
+        void compute_rotation_angles() {
+            // Following the original RoPE paper formulation
+            const float base = 10000.0f;
+            const long half_dim = d_head / 2;
+            angles.set_size(seq_len, half_dim);
+
+            for (long pos = 0; pos < seq_len; ++pos) {
+                for (long i = 0; i < half_dim; ++i) {
+                    float inv_freq = std::pow(base, -2.0f * (i + 0.5f) / d_head);
+                    angles(pos, i) = pos * inv_freq;
+                }
+            }
+        }
+
+        void precompute_trigonometric_values() {
+            // Precompute cos and sin for all angles
+            cos_values.set_size(angles.nr(), angles.nc());
+            sin_values.set_size(angles.nr(), angles.nc());
+
+            for (long i = 0; i < angles.size(); ++i) {
+                cos_values(i) = std::cos(angles(i));
+                sin_values(i) = std::sin(angles(i));
+            }
+        }
+
+        template <typename tensor_type>
+        void apply_rotary_embedding(
+            tensor_type& x,
+            bool is_backward = false
+        ) const {  // rotates consecutive element pairs of x in place using the cached cos/sin tables
+            const long batch_size = x.num_samples();
+            const long num_heads = x.k();
+            const long seq_length = x.nr();
+            const long dim = x.nc();
+            const bool is_odd = (dim % 2 != 0);
+            const long rot_dim = is_odd ? dim - 1 : dim;  // last element of an odd-sized row is left untouched
+
+            DLIB_CASSERT(dim == d_head, "Input dimension must match d_head param");
+            DLIB_CASSERT(seq_length == seq_len, "Sequence length must match seq_len param");
+
+            auto* ptr = x.host();
+            const long stride = seq_length * dim;  // elements per (sample, head) slice
+
+            for (long n = 0; n < batch_size; ++n) {
+                for (long h = 0; h < num_heads; ++h) {
+                    auto* x_ptr = ptr + (n * num_heads + h) * stride;
+
+                    for (long pos = 0; pos < seq_length; ++pos) {
+                        const float* cos = &cos_values(pos, 0);  // row `pos` of the cosine table
+                        const float* sin = &sin_values(pos, 0);  // row `pos` of the sine table
+
+                        for (long i = 0; i < rot_dim; i += 2) {
+                            const float x0 = x_ptr[pos * dim + i];  // read both values before either is overwritten
+                            const float x1 = x_ptr[pos * dim + i + 1];
+
+                            if (!is_backward) {  // forward: rotate the pair (x0, x1) by +angle
+                                x_ptr[pos * dim + i] = x0 * cos[i / 2] - x1 * sin[i / 2];
+                                x_ptr[pos * dim + i + 1] = x0 * sin[i / 2] + x1 * cos[i / 2];
+                            }
+                            else {  // backward: apply the transposed rotation (angle negated)
+                                x_ptr[pos * dim + i] = x0 * cos[i / 2] + x1 * sin[i / 2];
+                                x_ptr[pos * dim + i + 1] = -x0 * sin[i / 2] + x1 * cos[i / 2];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    private:
+        long seq_len, d_head;       // Sequence length and dimension of each head
+        matrix<float> angles;       // Precomputed rotation angles (seq_len x d_head/2)
+        matrix<float> cos_values;   // Precomputed cosine values
+        matrix<float> sin_values;   // Precomputed sine values
+        resizable_tensor params;    // Empty tensor (no learnable parameters)
+    };
+
+    // Alias that stacks a rotary_positional_embedding_ (RoPE) layer on top of SUBNET
+    template <typename SUBNET>
+    using rope = add_layer<rotary_positional_embedding_, SUBNET>;
+
+    template <long d_k_>
+    class scale_weights_ : public multiply_ {  // multiply_ constructed with the fixed factor 1/sqrt(d_k_)
+    public:
+        explicit scale_weights_() : multiply_(1.0f / std::sqrt(static_cast<float>(d_k_))) {}
+    };
+
+    template <long d_k, typename SUBNET>  // alias: scale_weights_<d_k> layer stacked on SUBNET
+    using scale_weights = add_layer<scale_weights_<d_k>, SUBNET>;
+
+    // Attention component extractors: linear_no_bias<d_model> output reshaped via reshape_to<num_heads, seq_len, d_model / num_heads>
+    template <long seq_len, long d_model, long num_heads, typename SUBNET>
+    using query = reshape_to<num_heads, seq_len, d_model / num_heads, linear_no_bias<d_model, SUBNET>>;
+
+    template <long seq_len, long d_model, long num_heads, typename SUBNET>  // same layer stack as `query`
+    using key = reshape_to<num_heads, seq_len, d_model / num_heads, linear_no_bias<d_model, SUBNET>>;
+
+    template <long seq_len, long d_model, long num_heads, typename SUBNET>  // same layer stack as `query`
+    using value = reshape_to<num_heads, seq_len, d_model / num_heads, linear_no_bias<d_model, SUBNET>>;
+
+    /*!
+        Multi-head self-attention: scaled Q*K^T scores (RoPE on Q and K) pass through
+        tril_mask and softmaxm, weight V, then linear_no_bias + DO, add_prev1 and rms_norm.
+        Template parameters:
+            - ACT: Activation function type (not referenced by this alias)
+            - DO: Dropout layer type for regularization
+            - seq_len: Sequence length; d_model: Model dimension (must be divisible by num_heads)
+            - num_heads: Number of attention heads
+    !*/
+    template <template <typename> class ACT, template <typename> class DO,
+        long seq_len, long d_model, long num_heads, typename SUBNET>
+    using multihead_attention =
+        rms_norm<add_prev1<
+        DO<linear_no_bias<d_model, reshape_to<1, seq_len, d_model,
+        multm_prev2<softmaxm<tril_mask<
+        scale_weights<d_model / num_heads,
+        multm_prev3<
+        // Apply RoPE to queries & keys
+        rope<query<seq_len, d_model, num_heads, skip1<
+        tag3<transpose<
+        rope<key<seq_len, d_model, num_heads, skip1<
+        tag2<value<seq_len, d_model, num_heads,
+        tag1<SUBNET>>>>>>>>>>>>>>>>>>>>>;
+
+    template <template <typename> class DO, long num_experts, typename SUBNET>  // fc16 -> DO -> fc32 -> leaky_relu -> DO -> fc16 -> leaky_relu -> DO -> fc<num_experts> -> softmax
+    using moe_router = softmax<fc<num_experts,
+        DO<leaky_relu<fc<16, DO<leaky_relu<fc<32,
+        DO<fc<16, SUBNET>>>>>>>>>>;
+
+    // Single expert network: linear<d_model * 4> -> DO -> ACT -> linear<d_model> -> DO
+    template <template <typename> class ACT, template <typename> class DO,
+        long d_model, typename SUBNET>
+    using expert = DO<linear<d_model, ACT<DO<linear<d_model * 4, SUBNET>>>>>;
+
+    // Sums two expert outputs, each multiplied by a value extracted from the layer reached via skip6.
+    // NOTE(review): no residual add happens in this alias; SUBNET must contain a tag6 layer (presumably the router) - confirm.
+    template <template <typename> class ACT, template <typename> class DO,
+        long d_model, typename SUBNET>
+    using weighted_sum_of_experts = add_prev<itag3,
+        mult_prev<itag1, extract<0, 1, 1, 1, skip6<         // Expert 1
+        itag1<expert<ACT, DO, d_model, iskip<
+        itag3<mult_prev<itag2, extract<1, 1, 1, 1, skip6<   // Expert 2
+        itag2<expert<ACT, DO, d_model,
+        itag0<SUBNET>>>>>>>>>>>>>>;
+
+    // Complete MoE feed-forward layer: 2-output moe_router (tag6) and experts both fed from tag5, then add_prev5 residual and rms_norm
+    template <template <typename> class ACT, template <typename> class DO,
+        long d_model, typename SUBNET>
+    using moe_feed_forward =
+        rms_norm<add_prev5<
+        weighted_sum_of_experts<ACT, DO, d_model, skip5<
+        tag6<moe_router<DO, 2,
+        tag5<SUBNET>>>>>>>;
+
+    /*!
+        This defines a transformer block: multihead_attention followed by
+        moe_feed_forward (each alias ends with its own residual add and rms_norm).
+
+        Template parameters:
+            - ACT: Activation function type
+            - DO: Dropout layer type for regularization
+            - seq_len: Sequence length (number of tokens/patches)
+            - d_model: Model dimension
+            - num_heads: Number of attention heads
+    !*/
+    template <template <typename> class ACT, template <typename> class DO,
+        long seq_len, long d_model, long num_heads, typename SUBNET>
+    using transformer_block =
+        moe_feed_forward<ACT, DO, d_model,
+        multihead_attention<ACT, DO, seq_len, d_model, num_heads, SUBNET>>;
+
+    // Positional embeddings: embeddings<num_embeddings, embedding_length> -> positional_encodings -> layer_norm
+    template <long num_embeddings, long embedding_length, typename SUBNET>
+    using positional_embeddings = layer_norm<positional_encodings<embeddings<num_embeddings, embedding_length, SUBNET>>>;
+
+    // Classification head: avg_pool_everything -> fc<num_logits> -> loss_multiclass_log (ACT and embedding_length are not referenced)
+    template <template <typename> class ACT, long num_logits, long embedding_length, typename SUBNET>
+    using classification_head = loss_multiclass_log<fc<num_logits, avg_pool_everything<SUBNET>>>;
+
+    /**
+     * @brief Transformer Model Configuration Template
+     *
+     * Provides a flexible and type-safe configuration mechanism for Transformer models
+     * with compile-time parameter validation and network generation.
+     *
+     * Template parameters:
+     * @param vocab_size Vocabulary size for token embedding
+     * @param num_layers Number of Transformer layers
+     * @param num_heads Number of attention heads
+     * @param embedding_dim Dimension of token embeddings
+     * @param max_seq_len Maximum sequence length
+     * @param activation_func Activation function type
+     * @param dropout_policy Dropout regularization policy
+     */
+    template <
+        long vocab_size = 5000,                                 // Default vocabulary size
+        long num_layers = 6,                                    // Default number of layers
+        long num_heads = 8,                                     // Default number of attention heads
+        long embedding_dim = 128,                               // Default embedding dimension
+        long max_seq_len = 300,                                 // Default maximum sequence length
+        template <typename> class activation_func = gelu,       // Default activation function
+        template <typename> class dropout_policy = dropout_10   // Default dropout policy
+    >
+    struct transformer_config {
+        // Core model parameters
+        static constexpr long VOCAB_SIZE = vocab_size;
+        static constexpr long NUM_LAYERS = num_layers;
+        static constexpr long NUM_HEADS = num_heads;
+        static constexpr long EMBEDDING_DIM = embedding_dim;
+        static constexpr long MAX_SEQ_LEN = max_seq_len;
+
+        /**
+         * @brief Compile-time validation of model configuration
+         *
+         * The assertions live at class scope so they fire whenever the config is instantiated;
+         * inside the nested struct they were only checked if `validation` itself was instantiated.
+         */
+        static_assert(VOCAB_SIZE > 0, "Vocabulary size must be positive");
+        static_assert(NUM_LAYERS > 0, "Number of layers must be positive");
+        static_assert(NUM_HEADS > 0, "Number of attention heads must be positive");
+        static_assert(EMBEDDING_DIM % NUM_HEADS == 0, "Embedding dimension must be divisible by number of heads");
+        struct validation {};  // kept (empty) so existing references to the nested name still compile
+
+        // Network component definitions
+        template <typename SUBNET>
+        using t_projection = fc<EMBEDDING_DIM, relu<bn_fc<fc<EMBEDDING_DIM * 2, SUBNET>>>>;
+        template <typename SUBNET>
+        using i_projection = fc<EMBEDDING_DIM, relu<affine<fc<EMBEDDING_DIM * 2, SUBNET>>>>;
+
+        template <typename SUBNET>
+        using t_transformer_block =
+            transformer_block<activation_func, dropout_policy, MAX_SEQ_LEN, EMBEDDING_DIM, NUM_HEADS, SUBNET>;
+
+        template <typename SUBNET>
+        using i_transformer_block =
+            transformer_block<activation_func, multiply, MAX_SEQ_LEN, EMBEDDING_DIM, NUM_HEADS, SUBNET>;
+
+        template<bool is_training>
+        using network_type = std::conditional_t<is_training,
+            classification_head<activation_func, VOCAB_SIZE, EMBEDDING_DIM,
+            t_projection<repeat<NUM_LAYERS, t_transformer_block,
+            positional_embeddings<VOCAB_SIZE, EMBEDDING_DIM, input<matrix<int, 0, 1>>>>>>,
+            classification_head<activation_func, VOCAB_SIZE, EMBEDDING_DIM,
+            i_projection<repeat<NUM_LAYERS, i_transformer_block,
+            positional_embeddings<VOCAB_SIZE, EMBEDDING_DIM, input<matrix<int, 0, 1>>>>>>>;
+
+        struct model_info {
+            static std::string describe() {
+                std::stringstream ss;
+                ss << "ERNIE Transformer model configuration:\n"
+                    << "- vocabulary size: " << VOCAB_SIZE << "\n"
+                    << "- layers: " << NUM_LAYERS << "\n"
+                    << "- attention heads: " << NUM_HEADS << "\n"
+                    << "- embedding dimension: " << EMBEDDING_DIM << "\n"
+                    << "- sequence length: " << MAX_SEQ_LEN;
+                return ss.str();
+            }
+        };
+    };
+}
+
+// Cross-platform Ctrl+C handling: the handlers below set g_terminate_flag and print a notice.
+namespace {
+    std::atomic<bool> g_terminate_flag(false);  // becomes true once Ctrl+C / SIGINT has been seen
+
+#ifdef _WIN32
+    // Windows-specific handler: claims CTRL_C_EVENT (returns TRUE) and declines every other event
+    BOOL WINAPI console_ctrl_handler(DWORD ctrl_type) {
+        if (ctrl_type == CTRL_C_EVENT) {
+            g_terminate_flag.store(true);
+            cout << "\nCtrl+C detected, cleaning up and closing the program..." << endl;
+            return TRUE;
+        }
+        return FALSE;
+    }
+#else
+    // Unix/Linux/macOS handler. NOTE(review): iostream output is not async-signal-safe; consider printing from the main loop instead.
+    void signal_handler(int signal) {
+        if (signal == SIGINT) {
+            g_terminate_flag.store(true);
+            cout << "\nCtrl+C detected, cleaning up and closing the program..." << endl;
+        }
+    }
+#endif
+
+    // Installs the platform's handler; on Windows a failed registration is only reported on cerr
+    void setup_interrupt_handler() {
+#ifdef _WIN32
+        if (!SetConsoleCtrlHandler(console_ctrl_handler, TRUE)) {
+            cerr << "ERROR: Could not set control handler" << endl;
+        }
+#else
+        struct sigaction sa;
+        sa.sa_handler = signal_handler;
+        sigemptyset(&sa.sa_mask);  // block no additional signals while the handler runs
+        sa.sa_flags = 0;  // no SA_* flags (e.g. SA_RESTART) requested
+        sigaction(SIGINT, &sa, NULL);  // return value is not checked
+#endif
+    }
+}
+
+// Returns the size of `filepath` in bytes, or 0 if it cannot be opened or measured
+size_t get_file_size(const std::string& filepath) {
+    // Open positioned at the end so tellg() directly reports the size in bytes.
+    std::ifstream file(filepath, std::ios::binary | std::ios::ate);
+    if (!file) return 0;
+    const std::streamoff end_pos = file.tellg();
+    // tellg() yields -1 on failure; report 0 instead of wrapping to a huge size_t.
+    return end_pos < 0 ? 0 : static_cast<size_t>(end_pos);
+}
+
+// Builds "<basename>.<partial|full>.tokens.bin" from the input path and the byte limit
+std::string generate_tokens_filename(const std::string& input_file, size_t max_bytes) {
+    // Keep only the final path component; both '/' and '\\' count as separators.
+    const size_t last_sep = input_file.find_last_of("/\\");
+    const std::string stem =
+        (last_sep == std::string::npos) ? input_file : input_file.substr(last_sep + 1);
+
+    // The tag only records whether a byte limit was in effect, not its value.
+    const char* const tag = (max_bytes > 0) ? "partial" : "full";
+    return stem + "." + tag + ".tokens.bin";
+}
+
+// Writes a uint64 token count followed by one uint32 per token; returns false if opening or writing fails
+bool save_tokens_to_file(const std::vector<int>& tokens, const std::string& filename) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file) {
+        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        return false;
+    }
+
+    // Write number of tokens
+    uint64_t num_tokens = tokens.size();
+    file.write(reinterpret_cast<const char*>(&num_tokens), sizeof(num_tokens));
+
+    // Write tokens
+    for (int token : tokens) {
+        uint32_t t = static_cast<uint32_t>(token);
+        file.write(reinterpret_cast<const char*>(&t), sizeof(t));
+    }
+    file.close();
+
+    // close() flushes; a failed write, flush or close leaves failbit/badbit set on the stream.
+    return !file.fail();
+}
+
+// Reads a uint64 token count then one uint32 per token; returns false (tokens unspecified) on open failure or short file
+bool load_tokens_from_file(std::vector<int>& tokens, const std::string& filename) {
+    std::ifstream file(filename, std::ios::binary);
+    if (!file) {
+        std::cerr << "Failed to open file for reading: " << filename << std::endl;
+        return false;
+    }
+
+    // Read the token count; fail instead of resizing from an unread (indeterminate) value.
+    uint64_t num_tokens = 0;
+    if (!file.read(reinterpret_cast<char*>(&num_tokens), sizeof(num_tokens))) return false;
+
+    // Read the payload; a truncated file is reported as failure rather than yielding garbage tokens.
+    tokens.resize(num_tokens);
+    for (uint64_t i = 0; i < num_tokens; ++i) {
+        uint32_t t = 0;
+        if (!file.read(reinterpret_cast<char*>(&t), sizeof(t))) return false;
+        tokens[i] = static_cast<int>(t);
+    }
+    file.close();
+
+    return true;
+}
+
+// Reads the file at `filepath` (at most max_bytes when max_bytes > 0); throws std::runtime_error if it cannot be opened
+std::string read_enwiki(const std::string& filepath, size_t max_bytes = 0) {
+    std::ifstream file(filepath, std::ios::binary);
+    if (!file) {
+        throw std::runtime_error("Cannot open enwiki file: " + filepath);
+    }
+    const size_t file_size = get_file_size(filepath);
+
+    // max_bytes == 0 (or anything >= the file size) means "read the whole file".
+    const size_t bytes_to_read = (max_bytes > 0 && max_bytes < file_size) ? max_bytes : file_size;
+    std::string content(bytes_to_read, ' ');
+    file.read(&content[0], static_cast<std::streamsize>(bytes_to_read));
+    // On a short read, drop the unread tail instead of returning space padding as data.
+    content.resize(static_cast<size_t>(file.gcount()));
+    return content;
+}
+
+// Compares two buffers byte-for-byte; prints up to 10 detailed mismatches plus a summary, returns true only if identical
+bool verify_match(const std::string& original, const std::string& generated) {
+    if (original.size() != generated.size()) {
+        cout << "Size mismatch: original=" << original.size()
+            << " bytes, generated=" << generated.size() << " bytes\n";
+        return false;
+    }
+
+    // Helper function to determine if a character is printable
+    auto is_printable = [](unsigned char c) { return c >= 32 && c < 127; };
+
+    // Helper function to format a byte as string (either character or hex)
+    auto format_byte = [&is_printable](unsigned char c) -> std::string {
+        if (is_printable(c)) {
+            return std::string(1, c);
+        }
+        else {
+            std::stringstream ss;
+            ss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);
+            return ss.str();
+        }
+    };
+
+    // Helper function to display context around a position
+    auto show_context = [&](size_t pos, size_t context_size) {
+        size_t start = (pos >= context_size) ? pos - context_size : 0;
+        size_t end = std::min(original.size(), pos + context_size + 1);
+
+        std::string orig_context, gen_context;
+        std::string orig_highlight, gen_highlight;
+
+        for (size_t i = start; i < end; ++i) {
+            unsigned char orig_c = static_cast<unsigned char>(original[i]);
+            unsigned char gen_c = static_cast<unsigned char>(generated[i]);
+
+            orig_context += format_byte(orig_c);
+            gen_context += format_byte(gen_c);
+
+            if (i == pos) {
+                orig_highlight = format_byte(orig_c);
+                gen_highlight = format_byte(gen_c);
+            }
+        }
+
+        // Byte values are printed as 0-255 (via unsigned char), matching the summary section below
+        cout << "Context at position " << pos << ":\n";
+        cout << "Original (" << static_cast<int>(static_cast<unsigned char>(original[pos])) << " = '" << orig_highlight
+            << "'): " << orig_context << "\n";
+        cout << "Generated (" << static_cast<int>(static_cast<unsigned char>(generated[pos])) << " = '" << gen_highlight
+            << "'): " << gen_context << "\n";
+    };
+
+    size_t mismatch_count = 0;
+    const size_t max_detailed_mismatches = 10;  // Maximum number of detailed errors to display
+    const size_t context_size = 10;             // Number of characters to show before/after error
+
+    // Track error patterns
+    std::map<std::pair<char, char>, int> error_patterns;
+
+    // Analyze consecutive error regions
+    size_t current_region_start = 0;
+    size_t current_region_length = 0;
+    std::vector<std::pair<size_t, size_t>> error_regions; // (start, length)
+
+    for (size_t i = 0; i < original.size(); ++i) {
+        if (original[i] != generated[i]) {
+            // Track error pattern
+            error_patterns[{original[i], generated[i]}]++;
+
+            // Increment mismatch count
+            mismatch_count++;
+
+            // Handle error regions
+            if (current_region_length == 0) {
+                current_region_start = i;
+                current_region_length = 1;
+            }
+            else if (i == current_region_start + current_region_length) {
+                current_region_length++;
+            }
+            else {
+                // Save previous region and start new one
+                error_regions.push_back({ current_region_start, current_region_length });
+                current_region_start = i;
+                current_region_length = 1;
+            }
+
+            // Show detailed information for first few mismatches
+            if (mismatch_count <= max_detailed_mismatches) {
+                cout << "\n----- Mismatch #" << mismatch_count << " -----\n";
+                show_context(i, context_size);
+            }
+        }
+    }
+
+    // Add the last region if exists
+    if (current_region_length > 0) {
+        error_regions.push_back({ current_region_start, current_region_length });
+    }
+
+    if (mismatch_count > 0) {
+        cout << "\n===== Error Summary =====\n";
+        cout << "Total mismatches: " << mismatch_count << " bytes ("
+            << (mismatch_count * 100.0 / original.size()) << "%)\n";
+
+        // Report on error regions
+        cout << "\nFound " << error_regions.size() << " error regions:\n";
+        for (size_t i = 0; i < error_regions.size() && i < 20; ++i) {
+            cout << "  Region #" << (i + 1) << ": Position " << error_regions[i].first
+                << ", Length " << error_regions[i].second << "\n";
+        }
+        if (error_regions.size() > 20)
+            cout << "  ... and " << (error_regions.size() - 20) << " more regions\n";
+
+        // Report on most common error patterns
+        cout << "\nMost common error patterns (original -> generated):\n";
+        std::vector<std::pair<std::pair<char, char>, int>> patterns(
+            error_patterns.begin(), error_patterns.end());
+        std::sort(patterns.begin(), patterns.end(),
+            [](const auto& a, const auto& b) { return a.second > b.second; });
+
+        for (size_t i = 0; i < patterns.size() && i < 10; ++i) {
+            char orig = patterns[i].first.first;
+            char gen = patterns[i].first.second;
+            int count = patterns[i].second;
+
+            cout << "  '" << format_byte(static_cast<unsigned char>(orig)) << "' ("
+                << static_cast<int>(static_cast<unsigned char>(orig)) << ") -> '"
+                << format_byte(static_cast<unsigned char>(gen)) << "' ("
+                << static_cast<int>(static_cast<unsigned char>(gen)) << "): "
+                << count << " occurrences\n";
+        }
+
+        return false;
+    }
+
+    cout << "Files match perfectly. All " << original.size() << " bytes are identical.\n";
+    return true;
+}
+
+// ----------------------------------------------------------------------------------------
+
+int main(int argc, char** argv)
+{
+    try
+    {
+        // Setup interrupt handling for clean termination
+        setup_interrupt_handler();
+
+        command_line_parser parser;
+        parser.add_option("train", "Train a transformer model on enwiki");
+        parser.add_option("generate", "Generate enwiki from a previously trained model");
+        parser.add_option("verify", "Verify generated output against original enwiki");
+        parser.add_option("tokenize-only", "Only tokenize the input file and save tokens");
+        parser.add_option("enwiki", "Path to the enwiki file", 1);
+        parser.add_option("max-tokens", "Maximum number of tokens to load in memory", 1);
+        parser.add_option("max-bytes", "Maximum number of bytes to process from enwiki", 1);
+        parser.add_option("percent", "Percentage of enwiki to process (0-100)", 1);
+        parser.add_option("learning-rate", "Set the learning rate (default: 1e-4)", 1);
+        parser.add_option("batch-size", "Set the mini-batch size (default: 64)", 1);
+        parser.add_option("patience", "Iterations without progress before early stopping (default: 15000)", 1);
+        parser.add_option("max-epochs", "Maximum number of training epochs (default: 10)", 1);
+        parser.add_option("alpha", "Set the weight decay for Adam (default: 0.004)", 1);
+        parser.add_option("beta1", "Set Adam's first moment coefficient (default: 0.9)", 1);
+        parser.add_option("beta2", "Set Adam's second moment coefficient (default: 0.999)", 1);
+        parser.add_option("model-file", "Path for model (default: ernie_model.dat)", 1);
+        parser.add_option("output-file", "Path for output (default: enwiki_generated.txt)", 1);
+        parser.add_option("prompt-tokens", "Number of tokens for initial prompt (default: seq-len)", 1);
+        parser.add_option("tokenizer", "Path to pre-trained tokenizer (default: ernie_tokenizer.vocab)", 1);
+        parser.add_option("tokens-file", "Path to pre-tokenized tokens file (optional)", 1);
+        parser.add_option("force-tokenize", "Force tokenization even if tokens file exists");
+        parser.parse(argc, argv);
+
+        if (parser.number_of_arguments() == 0 &&
+            !parser.option("train") && !parser.option("generate") &&
+            !parser.option("verify") && !parser.option("tokenize-only"))
+        {
+            parser.print_options();
+            return 0;
+        }
+
+        // Default values
+        const double learning_rate = get_option(parser, "learning-rate", 1e-4);
+        const long batch_size = get_option(parser, "batch-size", 64);
+        const long patience = get_option(parser, "patience", 15000);
+        const long max_epochs = get_option(parser, "max-epochs", 10);
+        const double alpha = get_option(parser, "alpha", 0.004);
+        const double beta1 = get_option(parser, "beta1", 0.9);
+        const double beta2 = get_option(parser, "beta2", 0.999);
+        const std::string model_file = get_option(parser, "model-file", "ernie_model.dat");
+        const std::string output_file = get_option(parser, "output-file", "enwiki_generated.txt");
+        const std::string enwiki_path = get_option(parser, "enwiki", "enwiki");
+        const long max_seq_len = 180;
+        const long num_layers = 2;
+        const long num_heads = 6;
+        const long embedding_dim = 228;
+        const std::string tokenizer_path = get_option(parser, "tokenizer", "ernie_tokenizer.vocab");
+        // Default number of prompt tokens = input sequence length
+        const long prompt_tokens = get_option(parser, "prompt-tokens", max_seq_len);
+        const bool force_tokenize = parser.option("force-tokenize");
+        const long num_tokens = 1000;
+
+        // Calculate max bytes to process
+        size_t max_bytes = 0, max_tokens = 0;
+        if (parser.option("max-tokens"))
+            max_tokens = std::stoul(parser.option("max-tokens").argument());        
+        if (parser.option("max-bytes")) {
+            max_bytes = std::stoul(parser.option("max-bytes").argument());
+        }
+        else if (parser.option("percent")) {
+            double percent = std::stod(parser.option("percent").argument());
+            size_t file_size = get_file_size(enwiki_path);
+            if (file_size > 0) {
+                max_bytes = static_cast<size_t>(file_size * percent / 100.0);
+                cout << "Processing " << percent << "% of enwiki = " << max_bytes << " bytes\n";
+            }
+            else {
+                cerr << "Warning: Cannot determine file size for percentage calculation\n";
+            }
+        }
+
+        // Tokenizer BPE
+        bpe_tokenizer tokenizer;
+
+        // Load pre-trained tokenizer
+        if (file_exists(tokenizer_path)) {
+            cout << "Loading pre-trained tokenizer from: " << tokenizer_path << endl;
+            deserialize(tokenizer_path) >> tokenizer;
+            cout << "Tokenizer loaded successfully with vocabulary size: " << tokenizer.get_vocab_size() << endl;
+        }
+        else {
+            cout << "Pre-trained tokenizer not found at: " << tokenizer_path << endl;
+            cout << "Will train a new tokenizer if in training mode." << endl;
+        }
+
+        // Determine tokens filename
+        std::string tokens_file = parser.option("tokens-file") ?
+            parser.option("tokens-file").argument() :
+            generate_tokens_filename(enwiki_path, max_bytes);
+
+        using ernie_transformer = ernie::transformer_config<
+            num_tokens,     // vocab_size
+            num_layers,     // number of layers
+            num_heads,      // number of attention heads
+            embedding_dim,  // embedding dimension
+            max_seq_len     // maximum sequence length
+        >;
+
+        // For GPU usage (if available)
+        std::vector<int> gpus{ 0 };
+
+        // Variables to store tokens (used in multiple modes)
+        std::vector<int> full_tokens;
+        bool tokens_loaded = false;
+
+        // ----------------------------------------------------------------------------------------
+        // Tokenize-only mode
+        // ----------------------------------------------------------------------------------------
+        if (parser.option("tokenize-only")) {
+            cout << "=== TOKENIZE-ONLY MODE ===\n";
+
+            // 1) Read the enwiki file (or portion)
+            cout << "Reading enwiki file from: " << enwiki_path;
+            if (max_bytes > 0) cout << " (limited to " << max_bytes << " bytes)";
+            cout << endl;
+
+            std::string enwiki_text = read_enwiki(enwiki_path, max_bytes);
+            cout << "Read " << enwiki_text.size() << " bytes\n";
+
+            // 2) Train a new tokenizer if needed
+            if (!file_exists(tokenizer_path)) {
+                cout << "Training new BPE tokenizer with vocabulary size " << num_tokens << "...\n";
+                tokenizer.train(enwiki_text, num_tokens, true);
+                serialize(tokenizer_path) << tokenizer;
+                cout << "Tokenizer saved to " << tokenizer_path << endl;
+            }
+
+            // 3) Tokenize the full text
+            cout << "Tokenizing input text...\n";
+            auto start_time = std::chrono::high_resolution_clock::now();
+            int text_start_id = tokenizer.get_special_token_id("<text>"),
+                text_end_id = tokenizer.get_special_token_id("</text>");
+            if (text_start_id < 0 || text_end_id < 0)
+                cout << "Warning: Special tokens not found in tokenizer vocabulary.\n";
+            full_tokens.clear();
+            full_tokens.push_back(text_start_id);
+            auto encoded_tokens = tokenizer.encode_raw(enwiki_text);
+            full_tokens.insert(full_tokens.end(), encoded_tokens.begin(), encoded_tokens.end());
+            full_tokens.push_back(text_end_id);
+            auto end_time = std::chrono::high_resolution_clock::now();
+            auto tokenize_time = std::chrono::duration_cast<std::chrono::seconds>(end_time - start_time).count();
+
+            cout << "Tokenization completed in " << tokenize_time << " seconds.\n";
+            cout << "Number of tokens: " << full_tokens.size() << endl;
+
+            // 4) Save tokens
+            cout << "Saving tokens to file: " << tokens_file << endl;
+            if (save_tokens_to_file(full_tokens, tokens_file)) {
+                cout << "Tokens successfully saved.\n";
+            }
+            else {
+                cerr << "Failed to save tokens.\n";
+            }
+
+            return 0;
+        }
+
+        // ----------------------------------------------------------------------------------------
+        // Training mode
+        // ----------------------------------------------------------------------------------------
+        if (parser.option("train"))
+        {
+            cout << "=== TRAINING MODE ===\n";
+
+            // Check if we should load pre-tokenized tokens
+            if (!force_tokenize && file_exists(tokens_file)) {
+                cout << "Found pre-tokenized tokens file: " << tokens_file << endl;
+                cout << "Loading tokens from file...\n";
+                if (load_tokens_from_file(full_tokens, tokens_file)) {
+                    cout << "Loaded " << full_tokens.size() << " tokens from file.\n";
+                    if (max_tokens > 0 && max_tokens < full_tokens.size()) {
+                        full_tokens.resize(max_tokens);
+                        cout << "But limited to " << full_tokens.size() << " tokens for training.\n";
+                    }
+                    tokens_loaded = true;
+                }
+                else {
+                    cerr << "Failed to load tokens from file. Will tokenize again.\n";
+                }
+            }
+
+            if (!tokens_loaded) {
+                // 1) Read the enwiki file (or portion)
+                cout << "Reading enwiki file from: " << enwiki_path;
+                if (max_bytes > 0) cout << " (limited to " << max_bytes << " bytes)";
+                cout << endl;
+
+                std::string enwiki_text = read_enwiki(enwiki_path, max_bytes);
+                cout << "Read " << enwiki_text.size() << " bytes\n";
+
+                // 2) Train a new tokenizer if needed
+                if (!file_exists(tokenizer_path)) {
+                    cout << "Training new BPE tokenizer with vocabulary size " << num_tokens << "...\n";
+                    tokenizer.train(enwiki_text, num_tokens, true);
+                    serialize(tokenizer_path) << tokenizer;
+                    cout << "Tokenizer saved to " << tokenizer_path << endl;
+                }
+
+                // 3) Tokenize the full text
+                cout << "Tokenizing input text...\n";
+                int text_start_id = tokenizer.get_special_token_id("<text>"),
+                    text_end_id = tokenizer.get_special_token_id("</text>");
+                if (text_start_id < 0 || text_end_id < 0)
+                    cout << "Warning: Special tokens not found in tokenizer vocabulary.\n";
+                auto start_time = std::chrono::high_resolution_clock::now();
+                full_tokens.clear();
+                full_tokens.push_back(text_start_id);
+                auto encoded_tokens = tokenizer.encode_raw(enwiki_text);
+                full_tokens.insert(full_tokens.end(), encoded_tokens.begin(), encoded_tokens.end());
+                full_tokens.push_back(text_end_id);
+                auto end_time = std::chrono::high_resolution_clock::now();
+                auto tokenize_time = std::chrono::duration_cast<std::chrono::seconds>(end_time - start_time).count();
+
+                cout << "Tokenization completed in " << tokenize_time << " seconds.\n";
+                cout << "Number of tokens: " << full_tokens.size() << endl;
+
+                // Save tokens for future use
+                cout << "Saving tokens to file: " << tokens_file << endl;
+                if (save_tokens_to_file(full_tokens, tokens_file)) {
+                    cout << "Tokens successfully saved for future use.\n";
+                }
+                else {
+                    cerr << "Warning: Failed to save tokens for future use.\n";
+                }
+            }
+
+            // 4) Prepare training sequences (sliding window)
+            cout << "Preparing training sequences...\n";
+            std::vector<matrix<int, 0, 1>> samples;
+            std::vector<unsigned long> labels;
+
+            // Calculate the maximum number of sequences we can create
+            size_t num_sequences = full_tokens.size() - max_seq_len;
+            if (num_sequences <= 0) {
+                cerr << "Error: Not enough tokens to create training sequences. Need at least "
+                    << (max_seq_len + 1) << " tokens.\n";
+                return 1;
+            }
+
+            cout << "Creating training samples...\n";
+
+            // For very large datasets, using a stride can reduce training time 
+            // without significantly affecting model quality
+            size_t stride = 1;  // Default: use every possible sequence
+            const size_t max_samples = 10000000;  // Optional: limit total samples to prevent memory issues
+
+            // If dataset is very large, use adaptive stride
+            if (num_sequences > max_samples && max_samples > 0) {
+                stride = num_sequences / max_samples + 1;
+                cout << "Dataset is large. Using stride of " << stride
+                    << " to limit samples to approximately " << max_samples << "\n";
+            }
+
+            // Reserve memory for better performance
+            samples.reserve(num_sequences / stride + 1);
+            labels.reserve(num_sequences / stride + 1);
+
+            // Create training samples with stride
+            for (size_t start = 0; start < num_sequences; start += stride) {
+                matrix<int, 0, 1> seq(max_seq_len, 1);
+                for (long t = 0; t < max_seq_len; ++t) {
+                    seq(t, 0) = full_tokens[start + t];
+                }
+                samples.push_back(seq);
+                labels.push_back(full_tokens[start + max_seq_len]);
+
+                if (samples.size() % 10000 == 0) {
+                    cout << "Created " << samples.size() << " training samples ("
+                        << (start * 100 / num_sequences) << "%)...\r";
+                }
+            }
+            full_tokens.clear();
+            cout << "Created " << samples.size() << " training samples (100%)...\n";
+
+            // 5) Build and train the network
+            using net_type = ernie_transformer::network_type<true>;
+            net_type net;
+            cout << "Model architecture:\n" << ernie_transformer::model_info::describe() << endl;
+            if (file_exists(model_file)) deserialize(model_file) >> net;
+
+            // Create trainer
+            dnn_trainer<net_type, adam> trainer(net, adam(alpha, beta1, beta2), gpus);
+            trainer.set_learning_rate(learning_rate);
+            trainer.set_min_learning_rate(1e-6);
+            trainer.set_mini_batch_size(batch_size);
+            // For perfect memorization, we allow more epochs without improvement
+            trainer.set_iterations_without_progress_threshold(patience);
+            trainer.set_max_num_epochs(max_epochs); // More epochs for perfect memorization
+            trainer.set_synchronization_file("ernie_trainer.sync", std::chrono::minutes(10));
+            trainer.be_quiet();
+
+            // Custom training loop - trainer.train(samples, labels)
+            cout << "Starting training...\n";            
+            size_t epoch = 0, samples_seen = 0, batches_seen = 0;
+            double total_loss = 0;
+            auto start_time = std::chrono::steady_clock::now();
+
+            // Shuffle indices for epoch
+            std::random_device rd;
+            std::mt19937 g(rd());
+            std::vector<size_t> indices(samples.size());
+            std::iota(indices.begin(), indices.end(), 0);
+
+            while (epoch < max_epochs && trainer.get_learning_rate() >= trainer.get_min_learning_rate()
+                && !g_terminate_flag.load())
+            {
+                // Shuffle for new epoch
+                std::shuffle(indices.begin(), indices.end(), g);
+
+                // Process mini-batches
+                for (size_t i = 0; i < samples.size() && !g_terminate_flag.load(); i += batch_size)
+                {
+                    // Get current mini-batch
+                    std::vector<matrix<int, 0, 1>> batch_samples;
+                    std::vector<unsigned long> batch_labels;
+
+                    batch_samples.reserve(batch_size);
+                    batch_labels.reserve(batch_size);
+
+                    for (size_t j = 0; j < batch_size; ++j) {
+                        batch_samples.push_back(samples[indices[i + j]]);
+                        batch_labels.push_back(labels[indices[i + j]]);
+                    }
+
+                    // Train on this batch
+                    trainer.train_one_step(batch_samples, batch_labels);
+                    double loss = trainer.get_average_loss();
+
+                    // Update stats
+                    total_loss += loss;
+                    samples_seen += batch_size;
+                    batches_seen++;
+
+                    // Progress reporting
+                    if (batches_seen % 100 == 0) {
+                        auto now = std::chrono::steady_clock::now();
+                        auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(now - start_time).count();
+                        double avg_loss = total_loss / batches_seen;
+                        double samples_per_sec = samples_seen / (elapsed > 0 ? elapsed : 1);
+
+                        cout << "epoch#: " << (epoch + 1) << "/" << max_epochs
+                            << " \t batch: " << batches_seen
+                            << " \t samples: " << samples_seen
+                            << " \t loss: " << avg_loss
+                            << " \t speed: " << samples_per_sec << " samples/sec\n";
+                        cout.flush();
+                    }
+                }
+                epoch++;
+
+                // Evaluate progress at end of epoch
+                cout << ">>> completed epoch " << epoch << " - average loss: " << (total_loss / batches_seen) << endl;
+            }
+
+            // Save model
+            net.clean();
+            serialize(model_file) << net;
+            cout << "Model saved to " << model_file << "\n";
+            std::remove("ernie_trainer.sync");
+            std::remove("ernie_trainer.sync_");
+
+            // Evaluate on training set
+            if (!g_terminate_flag.load()) {
+                cout << "Evaluating model accuracy...\n";
+                using net_infer = ernie_transformer::network_type<false>;
+                net_infer g_infer = net;
+                auto predicted = g_infer(samples);
+                size_t correct = 0;
+                for (size_t i = 0; i < labels.size(); ++i)
+                    if (predicted[i] == labels[i]) correct++;
+                double accuracy = (double)correct / labels.size();
+                cout << "Training accuracy: " << accuracy << "\n";
+
+                // We need perfect accuracy to reconstruct enwiki
+                if (accuracy < 0.9999) {
+                    cout << "WARNING: Model accuracy is less than 99.99%. The model may not "
+                        << "perfectly reconstruct the input text.\n";
+                }
+            }
+        }
+
+        // ----------------------------------------------------------------------------------------
+        // Generation mode
+        // ----------------------------------------------------------------------------------------
+        if (parser.option("generate"))
+        {
+            cout << "=== GENERATION MODE ===\n";
+
+            // 1) Load the model
+            using net_infer = ernie_transformer::network_type<false>;
+            net_infer net;
+            if (file_exists(model_file)) {
+                deserialize(model_file) >> net;
+                cout << "Loaded model from " << model_file << "\n";
+            }
+            else {
+                cerr << "Error: model file not found. Please run --train first.\n";
+                return 0;
+            }
+
+            // 2) Check that tokenizer is loaded
+            if (tokenizer.get_vocab_size() == 0) {
+                cerr << "Error: Tokenizer not loaded. Please provide a valid tokenizer file.\n";
+                return 0;
+            }
+
+            // 3) Read beginning of enwiki file for prompt
+            std::vector<int> prompt_tokens;
+
+            // Check if we have pre-tokenized tokens
+            if (file_exists(tokens_file)) {
+                cout << "Found pre-tokenized tokens file: " << tokens_file << endl;
+                cout << "Loading tokens for prompt...\n";
+
+                // We only need max_seq_len tokens, so we can load
+                // just the necessary part of the file
+                std::ifstream file(tokens_file, std::ios::binary);
+                if (!file) {
+                    cerr << "Failed to open tokens file: " << tokens_file << endl;
+                }
+                else {
+                    // Read total number of tokens
+                    uint64_t num_tokens;
+                    file.read(reinterpret_cast<char*>(&num_tokens), sizeof(num_tokens));
+
+                    // Read only the first max_seq_len tokens
+                    size_t tokens_to_read = std::min(static_cast<size_t>(max_seq_len), static_cast<size_t>(num_tokens));
+                    prompt_tokens.resize(tokens_to_read);
+
+                    for (size_t i = 0; i < tokens_to_read; ++i) {
+                        uint32_t t;
+                        file.read(reinterpret_cast<char*>(&t), sizeof(t));
+                        prompt_tokens[i] = static_cast<int>(t);
+                    }
+
+                    cout << "Loaded " << prompt_tokens.size() << " tokens for prompt from file.\n";
+                }
+            }
+
+            // If we couldn't load tokens, tokenize the prompt text
+            if (prompt_tokens.empty()) {
+                cout << "Reading initial prompt from enwiki...\n";
+                std::string enwiki_prompt;
+
+                if (file_exists(enwiki_path)) {
+                    // Read a portion large enough to cover the first tokens
+                    std::ifstream file(enwiki_path, std::ios::binary);
+                    // Buffer intentionally large to ensure we have enough text for tokens
+                    char buffer[max_seq_len * 10];
+                    file.read(buffer, sizeof(buffer));
+                    size_t bytes_read = file.gcount();
+                    enwiki_prompt = std::string(buffer, bytes_read);
+                }
+                else {
+                    cerr << "Error: Cannot find original enwiki file for initial prompt.\n";
+                    return 0;
+                }
+
+                // Tokenize the prompt
+                cout << "Tokenizing prompt...\n";
+                int text_start_id = tokenizer.get_special_token_id("<text>");
+                prompt_tokens.clear();                
+                prompt_tokens.push_back(text_start_id);
+                auto encoded_tokens = tokenizer.encode_raw(enwiki_prompt);
+                prompt_tokens.insert(prompt_tokens.end(), encoded_tokens.begin(), encoded_tokens.end());
+            }
+
+            // Limit to requested number of tokens (exact, no padding)
+            if (prompt_tokens.size() > (size_t)max_seq_len) {
+                prompt_tokens.resize(max_seq_len);
+            }
+            else if (prompt_tokens.size() < (size_t)max_seq_len) {
+                cerr << "Warning: Not enough tokens in prompt. Got " << prompt_tokens.size()
+                    << ", needed " << max_seq_len << ". Consider using a larger input file.\n";
+                return 0;
+            }
+            cout << "Using " << prompt_tokens.size() << " tokens for initial prompt\n";
+
+            // 5) Put prompt in input sequence
+            matrix<int, 0, 1> input_seq(max_seq_len, 1);
+            for (long i = 0; i < max_seq_len; ++i) {
+                input_seq(i, 0) = prompt_tokens[i];
+            }
+
+            // 6) Determine text size to generate
+            size_t target_size = 0;
+            if (max_bytes > 0) {
+                target_size = max_bytes;
+            }
+            else {
+                // Default: generate 1K or original file size
+                target_size = get_file_size(enwiki_path);
+                if (target_size == 0) {
+                    target_size = 1024;
+                }
+            }
+            cout << "Will generate approximately " << target_size << " bytes\n";
+
+            // 7) Open output file
+            std::ofstream outfile(output_file, std::ios::binary);
+            if (!outfile) {
+                cerr << "Error: Cannot open output file: " << output_file << "\n";
+                return 0;
+            }
+
+            // 8) Write initial text (corresponding to prompt tokens)
+            std::string initial_text = tokenizer.decode(prompt_tokens, false);
+            outfile.write(initial_text.c_str(), initial_text.size());
+
+            // 9) Generate the rest of the text autoregressively
+            cout << "Starting autoregressive generation...\n";
+
+            // Buffer for accumulation before writing
+            std::vector<int> token_buffer;
+            const size_t buffer_size = 100;
+
+            // Save start time to measure execution time
+            auto start_time = std::chrono::high_resolution_clock::now();
+            size_t total_bytes = initial_text.size();
+            size_t token_count = prompt_tokens.size();
+
+            // Generate until target size is reached
+            int start_of_text = tokenizer.get_special_token_id("<text>"),
+                end_of_text = tokenizer.get_special_token_id("</text>"), next_token = 0;
+            while (total_bytes < target_size && next_token != start_of_text && next_token != end_of_text
+                && !g_terminate_flag.load()) {
+                // Predict next token
+                next_token = net(input_seq);
+                token_buffer.push_back(next_token);
+                token_count++;
+
+                // Shift the input window
+                for (long i = 0; i < max_seq_len - 1; ++i)
+                    input_seq(i, 0) = input_seq(i + 1, 0);
+                input_seq(max_seq_len - 1, 0) = next_token;
+
+                // If buffer is full, write to file
+                if (token_buffer.size() >= buffer_size) {
+                    std::string chunk = tokenizer.decode(token_buffer, false);
+                    outfile.write(chunk.c_str(), chunk.size());
+                    total_bytes += chunk.size();
+                    token_buffer.clear();
+
+                    // Display progress
+                    auto current_time = std::chrono::high_resolution_clock::now();
+                    auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+                        current_time - start_time).count();
+                    double tokens_per_second = (token_count - prompt_tokens.size()) / (elapsed > 0 ? elapsed : 1);
+
+                    cout << "Generated " << (token_count - prompt_tokens.size()) << " tokens, "
+                        << total_bytes << " bytes ("
+                        << (total_bytes * 100.0 / target_size) << "%) - "
+                        << tokens_per_second << " tokens/sec - "
+                        << "Est. completion: "
+                        << (int)((target_size - total_bytes) / (tokens_per_second * (chunk.size() / (double)buffer_size)))
+                        << " seconds\r";
+                }
+                if (max_tokens > 0 && token_count >= max_tokens) break;
+            }
+
+            // Flush remaining buffer
+            if (!token_buffer.empty()) {
+                std::string chunk = tokenizer.decode(token_buffer, false);
+                outfile.write(chunk.c_str(), chunk.size());
+                total_bytes += chunk.size();
+            }
+            outfile.flush();
+            outfile.close();
+
+            auto end_time = std::chrono::high_resolution_clock::now();
+            auto total_time = std::chrono::duration_cast<std::chrono::seconds>(
+                end_time - start_time).count();
+
+            cout << "Generation complete in " << total_time << " seconds!\n";
+            cout << "Generated " << (token_count - prompt_tokens.size()) << " tokens after prompt, "
+                << total_bytes << " bytes total\n";
+            cout << "Output saved to " << output_file << "\n";
+        }
+
+        // ----------------------------------------------------------------------------------------
+        // Verification mode - Compare original and generated file
+        // ----------------------------------------------------------------------------------------
+        if (parser.option("verify"))
+        {
+            cout << "=== VERIFicAtiON MODE ===\n";
+
+            if (!file_exists(enwiki_path)) {
+                cerr << "Error: Original enwiki file not found at " << enwiki_path << "\n";
+                return 0;
+            }
+
+            if (!file_exists(output_file)) {
+                cerr << "Error: Generated file not found at " << output_file << "\n";
+                return 0;
+            }
+
+            // Read generated file
+            cout << "Reading generated file...\n";
+            std::string generated = read_enwiki(output_file);
+
+            // Read the same portion of original file
+            cout << "Reading original file (same size as generated)...\n";
+            std::string original = read_enwiki(enwiki_path, generated.size());
+
+            cout << "Verifying byte-for-byte match...\n";
+            bool match = verify_match(original, generated);
+
+            if (match)
+                cout << "SUCCESS: The generated file matches the original text perfectly!\n";
+            else
+                cout << "FAILED: The generated file does not match the original text.\n";
+        }
+
+        return 0;
+    }
+    catch (exception& e)
+    {
+        cerr << "Exception thrown: " << e.what() << endl;
+        return 1;
+    }
+}
\ No newline at end of file

From 22c2561aa7734149d15e43a19b9520c95f29a06c Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Thu, 29 May 2025 12:35:02 +0200
Subject: [PATCH 10/21] =?UTF-8?q?Added=20a=20new=20example=20for=20learnin?=
 =?UTF-8?q?g=20a=20=E2=80=9Ccomplex=E2=80=9D=20Transformer=20model.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c23067879a..1232d58b09 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -147,6 +147,7 @@ add_gui_example(dnn_dcgan_train_ex)
 add_gui_example(dnn_yolo_train_ex)
 add_gui_example(dnn_self_supervised_learning_ex)
 add_example(slm_basic_train_ex)
+add_example(slm_advanced_train_ex)
 add_gui_example(3d_point_cloud_ex)
 add_example(bayes_net_ex)
 add_example(bayes_net_from_disk_ex)

From 01cd0b2b9a5a30c0079946d00c8c9bbafc0f6092 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Thu, 29 May 2025 22:41:50 +0200
Subject: [PATCH 11/21] Updated example for training a Transformer model.

---
 examples/slm_advanced_train_ex.cpp | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp
index 81b9badd72..e307959055 100644
--- a/examples/slm_advanced_train_ex.cpp
+++ b/examples/slm_advanced_train_ex.cpp
@@ -49,6 +49,7 @@
 #include <dlib/cmd_line_parser.h>
 #include <dlib/misc_api.h>
 #include <dlib/tokenizer/bpe_tokenizer.h>
+#include <dlib/serialize.h>
 #include <dlib/dnn.h>
 
 using namespace std;
@@ -99,24 +100,25 @@ namespace ernie
         tensor& get_layer_params() { return params; }
 
         friend void serialize(const rotary_positional_embedding_& item, std::ostream& out) {
-            serialize("rotary_positional_embedding_", out);
-            serialize(item.seq_len, out);
-            serialize(item.d_head, out);
-            serialize(item.angles, out);
-            serialize(item.cos_values, out);
-            serialize(item.sin_values, out);
+            std::string version = "rotary_positional_embedding_";
+            dlib::serialize(version, out);
+            dlib::serialize(item.seq_len, out);
+            dlib::serialize(item.d_head, out);
+            dlib::serialize(item.angles, out);
+            dlib::serialize(item.cos_values, out);
+            dlib::serialize(item.sin_values, out);
         }
 
         friend void deserialize(rotary_positional_embedding_& item, std::istream& in) {
             std::string version;
-            deserialize(version, in);
+            dlib::deserialize(version, in);
             if (version != "rotary_positional_embedding_")
                 throw serialization_error("Unexpected version found while deserializing rotary_positional_embedding_.");
-            deserialize(item.seq_len, in);
-            deserialize(item.d_head, in);
-            deserialize(item.angles, in);
-            deserialize(item.cos_values, in);
-            deserialize(item.sin_values, in);
+            dlib::deserialize(item.seq_len, in);
+            dlib::deserialize(item.d_head, in);
+            dlib::deserialize(item.angles, in);
+            dlib::deserialize(item.cos_values, in);
+            dlib::deserialize(item.sin_values, in);
         }
 
         friend std::ostream& operator<<(std::ostream& out, const rotary_positional_embedding_& item) {
@@ -1041,7 +1043,7 @@ int main(int argc, char** argv)
                 for (size_t i = 0; i < labels.size(); ++i)
                     if (predicted[i] == labels[i]) correct++;
                 double accuracy = (double)correct / labels.size();
-                cout << "Training accuracy: " << accuracy << "\n";
+                cout << "Training accuracy: " << (accuracy * 100.0) << "%\n";
 
                 // We need perfect accuracy to reconstruct enwiki
                 if (accuracy < 0.9999) {

From 6b63e551b857c912b8f49c13e2a85bae4cb37f58 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Fri, 30 May 2025 09:24:00 +0200
Subject: [PATCH 12/21] fix for gcc/ffmpeg compilation

---
 dlib/tokenizer/bpe_tokenizer.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dlib/tokenizer/bpe_tokenizer.h b/dlib/tokenizer/bpe_tokenizer.h
index f1ae88cf43..e2a1ad40a5 100644
--- a/dlib/tokenizer/bpe_tokenizer.h
+++ b/dlib/tokenizer/bpe_tokenizer.h
@@ -71,9 +71,11 @@ namespace dlib
         // Train the tokenizer on the given text
         void train(const std::string& text, int vocab_size, bool verbose = false)
         {
-            DLIB_CASSERT(vocab_size >= (BASE_VOCAB_SIZE + special_tokens.size()));
+            int current_base = static_cast<int>(BASE_VOCAB_SIZE + special_tokens.size());
+            DLIB_CASSERT(vocab_size >= current_base);
             this->vocab_size = vocab_size;
-            int num_merges = vocab_size - (BASE_VOCAB_SIZE + special_tokens.size());
+            int num_merges = vocab_size - current_base;
+            DLIB_CASSERT(num_merges > 0);
 
             // Convert text to byte IDs
             std::vector<int> ids;

From ad1f7579852cd938e8f1878ad0da588cf6e4916b Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Fri, 30 May 2025 13:31:54 +0200
Subject: [PATCH 13/21] Fix a warning message for Ubuntu compilation.

---
 dlib/dnn/layers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index 7ec8b1a956..96ad83cbc4 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2337,8 +2337,8 @@ namespace dlib
 
     public:
         linear_() :
-            num_outputs(num_outputs_),
             num_inputs(0),
+            num_outputs(num_outputs_),            
             learning_rate_multiplier(1),
             bias_mode(bias_mode_) {
         }

From c91c45aca45c86c5038de3b096d55666eb69f734 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Fri, 30 May 2025 17:36:31 +0200
Subject: [PATCH 14/21] Update for Linux environment.

---
 examples/slm_advanced_train_ex.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp
index e307959055..ce250c6cf9 100644
--- a/examples/slm_advanced_train_ex.cpp
+++ b/examples/slm_advanced_train_ex.cpp
@@ -45,6 +45,7 @@
 #include <fstream>
 #include <chrono>
 #include <algorithm>
+#include <csignal>
 #include <dlib/data_io.h>
 #include <dlib/cmd_line_parser.h>
 #include <dlib/misc_api.h>
@@ -428,10 +429,9 @@ namespace {
             cerr << "ERROR: Could not set control handler" << endl;
         }
 #else
-        struct sigaction sa;
-        sa.sa_handler = signal_handler;
+        struct sigaction sa {};
         sigemptyset(&sa.sa_mask);
-        sa.sa_flags = 0;
+        sa.sa_handler = signal_handler;
         sigaction(SIGINT, &sa, NULL);
 #endif
     }
@@ -689,7 +689,6 @@ int main(int argc, char** argv)
         parser.add_option("beta2", "Set Adam's second moment coefficient (default: 0.999)", 1);
         parser.add_option("model-file", "Path for model (default: ernie_model.dat)", 1);
         parser.add_option("output-file", "Path for output (default: enwiki_generated.txt)", 1);
-        parser.add_option("prompt-tokens", "Number of tokens for initial prompt (default: seq-len)", 1);
         parser.add_option("tokenizer", "Path to pre-trained tokenizer (default: ernie_tokenizer.vocab)", 1);
         parser.add_option("tokens-file", "Path to pre-tokenized tokens file (optional)", 1);
         parser.add_option("force-tokenize", "Force tokenization even if tokens file exists");
@@ -705,9 +704,9 @@ int main(int argc, char** argv)
 
         // Default values
         const double learning_rate = get_option(parser, "learning-rate", 1e-4);
-        const long batch_size = get_option(parser, "batch-size", 64);
+        const size_t batch_size = get_option(parser, "batch-size", 64);
         const long patience = get_option(parser, "patience", 15000);
-        const long max_epochs = get_option(parser, "max-epochs", 10);
+        const size_t max_epochs = get_option(parser, "max-epochs", 10);
         const double alpha = get_option(parser, "alpha", 0.004);
         const double beta1 = get_option(parser, "beta1", 0.9);
         const double beta2 = get_option(parser, "beta2", 0.999);
@@ -720,7 +719,6 @@ int main(int argc, char** argv)
         const long embedding_dim = 228;
         const std::string tokenizer_path = get_option(parser, "tokenizer", "ernie_tokenizer.vocab");
         // Default number of prompt tokens = input sequence length
-        const long prompt_tokens = get_option(parser, "prompt-tokens", max_seq_len);
         const bool force_tokenize = parser.option("force-tokenize");
         const long num_tokens = 1000;
 

From 6fcc0aa66ed4b593336151f490086c742146a90f Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Sat, 31 May 2025 11:54:56 +0200
Subject: [PATCH 15/21] Fix batch building

---
 examples/slm_advanced_train_ex.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp
index ce250c6cf9..0cd9f58e3a 100644
--- a/examples/slm_advanced_train_ex.cpp
+++ b/examples/slm_advanced_train_ex.cpp
@@ -990,8 +990,9 @@ int main(int argc, char** argv)
                     batch_labels.reserve(batch_size);
 
                     for (size_t j = 0; j < batch_size; ++j) {
-                        batch_samples.push_back(samples[indices[i + j]]);
-                        batch_labels.push_back(labels[indices[i + j]]);
+                        size_t pos = (i + j) >= indices.size() ? j : (i + j);
+                        batch_samples.push_back(samples[indices[pos]]);
+                        batch_labels.push_back(labels[indices[pos]]);
                     }
 
                     // Train on this batch

From 5a1773ea9dab49805caf7ad507f4c3ca500b71e6 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Tue, 3 Jun 2025 17:48:04 +0200
Subject: [PATCH 16/21] Slight improvement in model definition.

---
 examples/slm_advanced_train_ex.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp
index 0cd9f58e3a..0254e1e2d4 100644
--- a/examples/slm_advanced_train_ex.cpp
+++ b/examples/slm_advanced_train_ex.cpp
@@ -262,9 +262,9 @@ namespace ernie
         tag1<SUBNET>>>>>>>>>>>>>>>>>>>>>;
 
     template <template <typename> class DO, long num_experts, typename SUBNET>
-    using moe_router = softmax<fc<num_experts,
+    using moe_router = softmax<fc<num_experts, avg_pool_everything<
         DO<leaky_relu<fc<16, DO<leaky_relu<fc<32,
-        DO<fc<16, SUBNET>>>>>>>>>>;
+        DO<fc<16, SUBNET>>>>>>>>>>>;
 
     // Single expert network
     template <template <typename> class ACT, template <typename> class DO,
@@ -968,8 +968,6 @@ int main(int argc, char** argv)
             auto start_time = std::chrono::steady_clock::now();
 
             // Shuffle indices for epoch
-            std::random_device rd;
-            std::mt19937 g(rd());
             std::vector<size_t> indices(samples.size());
             std::iota(indices.begin(), indices.end(), 0);
 
@@ -977,7 +975,7 @@ int main(int argc, char** argv)
                 && !g_terminate_flag.load())
             {
                 // Shuffle for new epoch
-                std::shuffle(indices.begin(), indices.end(), g);
+                std::shuffle(indices.begin(), indices.end(), std::default_random_engine{});
 
                 // Process mini-batches
                 for (size_t i = 0; i < samples.size() && !g_terminate_flag.load(); i += batch_size)

From 10d7c59abdebc45c13f189d168a9f1751b2c62d1 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Sat, 7 Jun 2025 18:14:41 +0200
Subject: [PATCH 17/21] linear_ layer implementation improvement

---
 dlib/dnn/layers.h | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index 96ad83cbc4..83010fe9fb 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2329,20 +2329,43 @@ namespace dlib
 
     template <
         unsigned long num_outputs_,
-        linear_bias_mode bias_mode_
+        linear_bias_mode bias_mode_ = LINEAR_HAS_BIAS
     >
     class linear_
     {
         static_assert(num_outputs_ > 0, "The number of outputs from a linear_ layer must be > 0");
 
     public:
-        linear_() :
+        explicit linear_() :
             num_inputs(0),
             num_outputs(num_outputs_),            
             learning_rate_multiplier(1),
             bias_mode(bias_mode_) {
         }
 
+        // Copy construction is compiler-generated: each data member is
+        // copy-constructed from the corresponding member of `other`, in
+        // declaration order.  Defaulting it (instead of listing members by
+        // hand) means a member added later cannot be left out of a copy.
+        linear_(const linear_& other) = default;
+
+        // A user-declared copy constructor suppresses the implicit move
+        // constructor; request it explicitly so rvalues are moved, not copied.
+        linear_(linear_&& other) = default;
+
+        // Copy-assigns the layer settings, params, weights and biases from `other`; self-assignment is a no-op.
+        linear_& operator=(const linear_& other) {
+            if (this == &other) return *this;
+            num_outputs = other.num_outputs;
+            num_inputs = other.num_inputs;
+            learning_rate_multiplier = other.learning_rate_multiplier;
+            bias_mode = other.bias_mode;
+            params = other.params;
+            weights = other.weights;
+            biases = other.biases;
+            return *this;
+        }
+
         double get_learning_rate_multiplier() const { return learning_rate_multiplier; }
         void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
 
@@ -2515,7 +2538,7 @@ namespace dlib
         unsigned long num_outputs,
         typename SUBNET
     >
-    using linear = add_layer<linear_<num_outputs, LINEAR_HAS_BIAS>, SUBNET>;
+    using linear = add_layer<linear_<num_outputs>, SUBNET>;
 
     template <
         unsigned long num_outputs,

From d4bf94b927b10222b0e40316a9a83909fb0b788b Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Sat, 7 Jun 2025 23:09:21 +0200
Subject: [PATCH 18/21] finalizing the example

---
 examples/slm_advanced_train_ex.cpp | 77 +++++++++++++++++-------------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp
index 0254e1e2d4..b655b13ba1 100644
--- a/examples/slm_advanced_train_ex.cpp
+++ b/examples/slm_advanced_train_ex.cpp
@@ -56,8 +56,19 @@
 using namespace std;
 using namespace dlib;
 
-namespace ernie
+namespace dlib
 {
+    /*!
+        @class rotary_positional_embedding_
+        @brief Implements Rotary Positional Embeddings (RoPE) for transformers
+
+        This layer applies rotary positional embeddings to queries and keys in
+        self-attention layers, providing relative positional information without
+        absolute position embeddings.
+
+        The implementation follows the RoPE formulation from [2], where positions
+        are encoded through rotation matrices applied to pairs of dimensions.
+    !*/
     class rotary_positional_embedding_ {
     public:
         explicit rotary_positional_embedding_() = default;
@@ -386,7 +397,7 @@ namespace ernie
         struct model_info {
             static std::string describe() {
                 std::stringstream ss;
-                ss << "ERNIE Transformer model configuration:\n"
+                ss << "Transformer model configuration:\n"
                     << "- vocabulary size: " << VOCAB_SIZE << "\n"
                     << "- layers: " << NUM_LAYERS << "\n"
                     << "- attention heads: " << NUM_HEADS << "\n"
@@ -674,9 +685,9 @@ int main(int argc, char** argv)
         command_line_parser parser;
         parser.add_option("train", "Train a transformer model on enwiki");
         parser.add_option("generate", "Generate enwiki from a previously trained model");
-        parser.add_option("verify", "Verify generated output against original enwiki");
+        parser.add_option("verify", "Verify generated output against original data");
         parser.add_option("tokenize-only", "Only tokenize the input file and save tokens");
-        parser.add_option("enwiki", "Path to the enwiki file", 1);
+        parser.add_option("enwiki", "Path to the enwiki file (default: enwiki.txt)", 1);
         parser.add_option("max-tokens", "Maximum number of tokens to load in memory", 1);
         parser.add_option("max-bytes", "Maximum number of bytes to process from enwiki", 1);
         parser.add_option("percent", "Percentage of enwiki to process (0-100)", 1);
@@ -687,9 +698,9 @@ int main(int argc, char** argv)
         parser.add_option("alpha", "Set the weight decay for Adam (default: 0.004)", 1);
         parser.add_option("beta1", "Set Adam's first moment coefficient (default: 0.9)", 1);
         parser.add_option("beta2", "Set Adam's second moment coefficient (default: 0.999)", 1);
-        parser.add_option("model-file", "Path for model (default: ernie_model.dat)", 1);
+        parser.add_option("model-file", "Path for model (default: slm_enwiki_model.dat)", 1);
         parser.add_option("output-file", "Path for output (default: enwiki_generated.txt)", 1);
-        parser.add_option("tokenizer", "Path to pre-trained tokenizer (default: ernie_tokenizer.vocab)", 1);
+        parser.add_option("tokenizer", "Path to pre-trained tokenizer (default: enwiki_tokenizer.vocab)", 1);
         parser.add_option("tokens-file", "Path to pre-tokenized tokens file (optional)", 1);
         parser.add_option("force-tokenize", "Force tokenization even if tokens file exists");
         parser.parse(argc, argv);
@@ -710,14 +721,14 @@ int main(int argc, char** argv)
         const double alpha = get_option(parser, "alpha", 0.004);
         const double beta1 = get_option(parser, "beta1", 0.9);
         const double beta2 = get_option(parser, "beta2", 0.999);
-        const std::string model_file = get_option(parser, "model-file", "ernie_model.dat");
+        const std::string model_file = get_option(parser, "model-file", "slm_enwiki_model.dat");
         const std::string output_file = get_option(parser, "output-file", "enwiki_generated.txt");
-        const std::string enwiki_path = get_option(parser, "enwiki", "enwiki");
+        const std::string enwiki_path = get_option(parser, "enwiki", "enwiki.txt");
         const long max_seq_len = 180;
         const long num_layers = 2;
         const long num_heads = 6;
         const long embedding_dim = 228;
-        const std::string tokenizer_path = get_option(parser, "tokenizer", "ernie_tokenizer.vocab");
+        const std::string tokenizer_path = get_option(parser, "tokenizer", "enwiki_tokenizer.vocab");
         // Default number of prompt tokens = input sequence length
         const bool force_tokenize = parser.option("force-tokenize");
         const long num_tokens = 1000;
@@ -760,7 +771,7 @@ int main(int argc, char** argv)
             parser.option("tokens-file").argument() :
             generate_tokens_filename(enwiki_path, max_bytes);
 
-        using ernie_transformer = ernie::transformer_config<
+        using enwiki_transformer = transformer_config<
             num_tokens,     // vocab_size
             num_layers,     // number of layers
             num_heads,      // number of attention heads
@@ -945,9 +956,9 @@ int main(int argc, char** argv)
             cout << "Created " << samples.size() << " training samples (100%)...\n";
 
             // 5) Build and train the network
-            using net_type = ernie_transformer::network_type<true>;
+            using net_type = enwiki_transformer::network_type<true>;
             net_type net;
-            cout << "Model architecture:\n" << ernie_transformer::model_info::describe() << endl;
+            cout << "Model architecture:\n" << enwiki_transformer::model_info::describe() << endl;
             if (file_exists(model_file)) deserialize(model_file) >> net;
 
             // Create trainer
@@ -958,7 +969,7 @@ int main(int argc, char** argv)
             // For perfect memorization, we allow more epochs without improvement
             trainer.set_iterations_without_progress_threshold(patience);
             trainer.set_max_num_epochs(max_epochs); // More epochs for perfect memorization
-            trainer.set_synchronization_file("ernie_trainer.sync", std::chrono::minutes(10));
+            trainer.set_synchronization_file("enwiki_trainer.sync", std::chrono::minutes(10));
             trainer.be_quiet();
 
             // Custom training loop - trainer.train(samples, labels)
@@ -1027,27 +1038,29 @@ int main(int argc, char** argv)
             net.clean();
             serialize(model_file) << net;
             cout << "Model saved to " << model_file << "\n";
-            std::remove("ernie_trainer.sync");
-            std::remove("ernie_trainer.sync_");
+            std::remove("enwiki_trainer.sync");
+            std::remove("enwiki_trainer.sync_");
 
             // Evaluate on training set
-            if (!g_terminate_flag.load()) {
-                cout << "Evaluating model accuracy...\n";
-                using net_infer = ernie_transformer::network_type<false>;
-                net_infer g_infer = net;
-                auto predicted = g_infer(samples);
-                size_t correct = 0;
-                for (size_t i = 0; i < labels.size(); ++i)
-                    if (predicted[i] == labels[i]) correct++;
-                double accuracy = (double)correct / labels.size();
-                cout << "Training accuracy: " << (accuracy * 100.0) << "%\n";
-
-                // We need perfect accuracy to reconstruct enwiki
-                if (accuracy < 0.9999) {
-                    cout << "WARNING: Model accuracy is less than 99.99%. The model may not "
-                        << "perfectly reconstruct the input text.\n";
+            {
+                if (!g_terminate_flag.load()) {
+                    cout << "Evaluating model accuracy...\n";
+                    using net_infer = enwiki_transformer::network_type<false>;
+                    net_infer g_infer = net;
+                    auto predicted = g_infer(samples);
+                    size_t correct = 0;
+                    for (size_t i = 0; i < labels.size(); ++i)
+                        if (predicted[i] == labels[i]) correct++;
+                    double accuracy = (double)correct / labels.size();
+                    cout << "Training accuracy: " << (accuracy * 100.0) << "%\n";
+
+                    // We need perfect accuracy to reconstruct enwiki
+                    if (accuracy < 0.9999) {
+                        cout << "WARNING: Model accuracy is less than 99.99%. The model may not "
+                            << "perfectly reconstruct the input text.\n";
+                    }
                 }
-            }
+            }            
         }
 
         // ----------------------------------------------------------------------------------------
@@ -1058,7 +1071,7 @@ int main(int argc, char** argv)
             cout << "=== GENERATION MODE ===\n";
 
             // 1) Load the model
-            using net_infer = ernie_transformer::network_type<false>;
+            using net_infer = enwiki_transformer::network_type<false>;
             net_infer net;
             if (file_exists(model_file)) {
                 deserialize(model_file) >> net;

From a4dac0b64a0663320ad327d5fd815cf5bc870a2f Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Sun, 8 Jun 2025 14:33:19 +0200
Subject: [PATCH 19/21] Fixing break condition in training method.

---
 dlib/tokenizer/bpe_tokenizer.h | 83 +++++++++++++---------------------
 1 file changed, 31 insertions(+), 52 deletions(-)

diff --git a/dlib/tokenizer/bpe_tokenizer.h b/dlib/tokenizer/bpe_tokenizer.h
index e2a1ad40a5..edec421e86 100644
--- a/dlib/tokenizer/bpe_tokenizer.h
+++ b/dlib/tokenizer/bpe_tokenizer.h
@@ -31,36 +31,21 @@ namespace dlib
                 vocab[i] = std::vector<uint8_t>{ static_cast<uint8_t>(i) };
             
             // Initialize special tokens with sequential IDs
-            special_tokens =
-            {
-                {"<text>",      BASE_VOCAB_SIZE},
-                {"</text>",     BASE_VOCAB_SIZE + 1},
-                {"<url>",       BASE_VOCAB_SIZE + 2},
-                {"</url>",      BASE_VOCAB_SIZE + 3},
-                {"<image>",     BASE_VOCAB_SIZE + 4},
-                {"</image>",    BASE_VOCAB_SIZE + 5},
-                {"<video>",     BASE_VOCAB_SIZE + 6},
-                {"</video>",    BASE_VOCAB_SIZE + 7},
-                {"<audio>",     BASE_VOCAB_SIZE + 8},
-                {"</audio>",    BASE_VOCAB_SIZE + 9},
-                {"<file>",      BASE_VOCAB_SIZE + 10},
-                {"</file>",     BASE_VOCAB_SIZE + 11},
-                {"<code>",      BASE_VOCAB_SIZE + 12},
-                {"</code>",     BASE_VOCAB_SIZE + 13},
-                {"<summary>",   BASE_VOCAB_SIZE + 14},
-                {"</summary>",  BASE_VOCAB_SIZE + 15},
-                {"<think>",     BASE_VOCAB_SIZE + 16},
-                {"</think>",    BASE_VOCAB_SIZE + 17},
-                {"<start>",     BASE_VOCAB_SIZE + 18},
-                {"<end>",       BASE_VOCAB_SIZE + 19},
-                {"<user>",      BASE_VOCAB_SIZE + 20},
-                {"<bot>",       BASE_VOCAB_SIZE + 21},
-                {"<system>",    BASE_VOCAB_SIZE + 22},
-                {"<question>",  BASE_VOCAB_SIZE + 23},
-                {"<answer>",    BASE_VOCAB_SIZE + 24},
-                {"<search>",    BASE_VOCAB_SIZE + 25},
-                {"<unk>",       BASE_VOCAB_SIZE + 26},
-                {"<pad>",       BASE_VOCAB_SIZE + 27}
+            special_tokens = {
+                {"<text>", BASE_VOCAB_SIZE},            {"</text>", BASE_VOCAB_SIZE + 1},
+                {"<url>", BASE_VOCAB_SIZE + 2},         {"</url>", BASE_VOCAB_SIZE + 3},
+                {"<image>", BASE_VOCAB_SIZE + 4},       {"</image>", BASE_VOCAB_SIZE + 5},
+                {"<video>", BASE_VOCAB_SIZE + 6},       {"</video>", BASE_VOCAB_SIZE + 7},
+                {"<audio>", BASE_VOCAB_SIZE + 8},       {"</audio>", BASE_VOCAB_SIZE + 9},
+                {"<file>", BASE_VOCAB_SIZE + 10},       {"</file>", BASE_VOCAB_SIZE + 11},
+                {"<code>", BASE_VOCAB_SIZE + 12},       {"</code>", BASE_VOCAB_SIZE + 13},
+                {"<summary>", BASE_VOCAB_SIZE + 14},    {"</summary>", BASE_VOCAB_SIZE + 15},
+                {"<think>", BASE_VOCAB_SIZE + 16},      {"</think>", BASE_VOCAB_SIZE + 17},
+                {"<start>", BASE_VOCAB_SIZE + 18},      {"<end>", BASE_VOCAB_SIZE + 19},
+                {"<user>", BASE_VOCAB_SIZE + 20},       {"<bot>", BASE_VOCAB_SIZE + 21},
+                {"<system>", BASE_VOCAB_SIZE + 22},     {"<question>", BASE_VOCAB_SIZE + 23},
+                {"<answer>", BASE_VOCAB_SIZE + 24},     {"<search>", BASE_VOCAB_SIZE + 25},
+                {"<unk>", BASE_VOCAB_SIZE + 26},        {"<pad>", BASE_VOCAB_SIZE + 27}
             };
 
             // Initialize the vector of special token IDs
@@ -79,6 +64,7 @@ namespace dlib
 
             // Convert text to byte IDs
             std::vector<int> ids;
+            ids.reserve(text.size());
             for (char c : text) ids.push_back(static_cast<uint8_t>(c));
 
             // Perform BPE merges
@@ -88,38 +74,31 @@ namespace dlib
 
                 // Find the most frequent pair that does not exceed MAX_TOKEN_LENGTH
                 auto pair = get_most_frequent_pair(stats);
+                if (pair.first == -1) break;
 
                 // Check if the resulting token would exceed MAX_TOKEN_LENGTH
                 size_t new_token_length = vocab[pair.first].size() + vocab[pair.second].size();
                 if (new_token_length > MAX_TOKEN_LENGTH) {
                     if (verbose)
-                    {
-                        std::cout << "\r"
-                            << std::setw(100) << std::flush
-                            << "\rskipping merge " << std::to_string(i + 1) << "/" << std::to_string(num_merges) << ": ("
-                            << std::to_string(pair.first) << "," << std::to_string(pair.second) << ") -> new token length "
-                            << std::to_string(new_token_length) << " exceeds limit of " << std::to_string(MAX_TOKEN_LENGTH)
-                            << std::flush;
-                    }
+                        std::cout << "\r" << std::setw(100) << std::flush << "\r[skip] merge " << (i + 1)
+                        << ": token too long: " << new_token_length << "/" << MAX_TOKEN_LENGTH << std::flush;
                     continue; // Skip this merge
                 }
 
-                int idx = (BASE_VOCAB_SIZE + (int)special_tokens.size()) + i;
-                ids = merge(ids, pair, idx);
-                merges[pair] = idx;
-                vocab[idx].insert(vocab[idx].end(), vocab[pair.first].begin(), vocab[pair.first].end());
-                vocab[idx].insert(vocab[idx].end(), vocab[pair.second].begin(), vocab[pair.second].end());
+                int new_id = current_base + i;
+                merges[pair] = new_id;
+
+                std::vector<uint8_t>& new_token = vocab[new_id];
+                new_token.reserve(new_token_length);
+                new_token.insert(new_token.end(), vocab[pair.first].begin(), vocab[pair.first].end());
+                new_token.insert(new_token.end(), vocab[pair.second].begin(), vocab[pair.second].end());
+
+                ids = merge(ids, pair, new_id);
 
                 if (verbose)
-                {
-                    std::cout << "\r"
-                        << std::setw(100) << std::flush
-                        << "\rmerge " << std::to_string(i + 1) << "/" << std::to_string(num_merges) << ": ("
-                        << std::to_string(pair.first) << "," << std::to_string(pair.second) << ") -> " << std::to_string(idx)
-                        << " (" << bytes_to_string(vocab[idx]) << ") had "
-                        << std::to_string(stats[pair]) << " occurrences"
-                        << std::endl;
-                }
+                    std::cout << "\r" << std::setw(100) << std::flush << "\r[merge] " << (i + 1) << "/" << num_merges
+                    << ": (" << pair.first << "," << pair.second << ") -> " << new_id
+                    << " (" << bytes_to_string(vocab[new_id]) << ")" << std::endl;
             }
         }
 

From 63454e35bef2674c2a4f4ade59e95ae3b73ebf08 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Sun, 8 Jun 2025 16:27:57 +0200
Subject: [PATCH 20/21] Fixing declaration order of variables.

---
 dlib/dnn/layers.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index 83010fe9fb..2b0136ef91 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2337,8 +2337,8 @@ namespace dlib
 
     public:
         explicit linear_() :
-            num_inputs(0),
-            num_outputs(num_outputs_),            
+            num_outputs(num_outputs_),
+            num_inputs(0),                        
             learning_rate_multiplier(1),
             bias_mode(bias_mode_) {
         }
@@ -2368,8 +2368,7 @@ namespace dlib
 
         double get_learning_rate_multiplier() const { return learning_rate_multiplier; }
         void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
-
-        unsigned long get_num_inputs() const { return num_inputs; }
+        
         unsigned long get_num_outputs() const { return num_outputs; }
         void set_num_outputs(long num)
         {
@@ -2381,6 +2380,7 @@ namespace dlib
                 num_outputs = num;
             }
         }
+        unsigned long get_num_inputs() const { return num_inputs; }
         linear_bias_mode get_bias_mode() const { return bias_mode; }
 
         template <typename SUBNET>
@@ -2526,8 +2526,8 @@ namespace dlib
         }
 
     private:
-        unsigned long num_inputs;
         unsigned long num_outputs;
+        unsigned long num_inputs;        
         double learning_rate_multiplier;
         linear_bias_mode bias_mode;
         resizable_tensor params;

From 87ed70a3d77339d62c60c7d5f737c9e1f3014285 Mon Sep 17 00:00:00 2001
From: unknown <cydraltechnology@gmail.com>
Date: Sun, 8 Jun 2025 17:22:48 +0200
Subject: [PATCH 21/21] bpe_tokenizer improvements.

---
 dlib/tokenizer/bpe_tokenizer.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/dlib/tokenizer/bpe_tokenizer.h b/dlib/tokenizer/bpe_tokenizer.h
index edec421e86..642f7c760b 100644
--- a/dlib/tokenizer/bpe_tokenizer.h
+++ b/dlib/tokenizer/bpe_tokenizer.h
@@ -58,9 +58,8 @@ namespace dlib
         {
             int current_base = static_cast<int>(BASE_VOCAB_SIZE + special_tokens.size());
             DLIB_CASSERT(vocab_size >= current_base);
-            this->vocab_size = vocab_size;
             int num_merges = vocab_size - current_base;
-            DLIB_CASSERT(num_merges > 0);
+            if (num_merges <= 0) return;
 
             // Convert text to byte IDs
             std::vector<int> ids;
@@ -68,7 +67,8 @@ namespace dlib
             for (char c : text) ids.push_back(static_cast<uint8_t>(c));
 
             // Perform BPE merges
-            for (int i = 0; i < num_merges; ++i) {
+            int n_merges = 0;
+            for (; n_merges < num_merges; ++n_merges) {
                 auto stats = get_stats(ids);
                 if (stats.empty()) break;
 
@@ -80,12 +80,12 @@ namespace dlib
                 size_t new_token_length = vocab[pair.first].size() + vocab[pair.second].size();
                 if (new_token_length > MAX_TOKEN_LENGTH) {
                     if (verbose)
-                        std::cout << "\r" << std::setw(100) << std::flush << "\r[skip] merge " << (i + 1)
+                        std::cout << "\r" << std::setw(100) << std::flush << "\r[skip] merge " << (n_merges + 1)
                         << ": token too long: " << new_token_length << "/" << MAX_TOKEN_LENGTH << std::flush;
                     continue; // Skip this merge
                 }
 
-                int new_id = current_base + i;
+                int new_id = current_base + n_merges;
                 merges[pair] = new_id;
 
                 std::vector<uint8_t>& new_token = vocab[new_id];
@@ -96,10 +96,11 @@ namespace dlib
                 ids = merge(ids, pair, new_id);
 
                 if (verbose)
-                    std::cout << "\r" << std::setw(100) << std::flush << "\r[merge] " << (i + 1) << "/" << num_merges
+                    std::cout << "\r" << std::setw(100) << std::flush << "\r[merge] " << (n_merges + 1) << "/" << num_merges
                     << ": (" << pair.first << "," << pair.second << ") -> " << new_id
                     << " (" << bytes_to_string(vocab[new_id]) << ")" << std::endl;
             }
+            this->vocab_size = current_base + n_merges;
         }
 
         // Encode the given text into subword tokens without paragraph splitting or special token wrapping
@@ -271,7 +272,7 @@ namespace dlib
         // Save the tokenizer model and vocabulary to file
         friend void serialize(const bpe_tokenizer& tok, std::ostream& out)
         {
-            serialize("bpe_tokenizer2_", out);
+            serialize("bpe_tokenizer_", out);
             serialize(tok.special_tokens, out);
             serialize(tok.special_token_map, out);
             serialize(tok.merges, out);
@@ -283,7 +284,7 @@ namespace dlib
         friend void deserialize(bpe_tokenizer& tok, std::istream& in) {
             std::string version;
             dlib::deserialize(version, in);
-            if (version != "bpe_tokenizer2_")
+            if (version != "bpe_tokenizer_")
                 throw dlib::serialization_error("Unexpected version '" + version + "' found while deserializing dlib::bpe_tokenizer_.");
             deserialize(tok.special_tokens, in);
             deserialize(tok.special_token_map, in);
@@ -366,14 +367,16 @@ namespace dlib
             // Iterate over all pairs in the statistics map
             for (const auto& stat : stats) {
                 const std::pair<int, int>& pair = stat.first; // Extract the token pair
-                int count = stat.second; // Extract the frequency count
+                int frequency = stat.second; // Extract the frequency
 
                 // Check if the new token formed by merging the pair would exceed the maximum allowed length
                 size_t new_token_length = vocab.at(pair.first).size() + vocab.at(pair.second).size();
                 if (new_token_length > MAX_TOKEN_LENGTH) continue; // Skip this pair if it exceeds the maximum token length
 
                 // Calculate the score for this pair (frequency * length_penalty)
-                double score = (size_t)count * (new_token_length > (MAX_TOKEN_LENGTH / 2) ? 1.75 : 1.0);
+                double length_bonus = std::min(2.0, 1.0 + (static_cast<double>(new_token_length) - 2.0) * 0.1);
+                double frequency_weight = std::log1p(frequency);
+                double score = frequency_weight * length_bonus;
 
                 // Update the best pair if the current pair has a higher score
                 if (score > max_score)