diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index 7ec8b1a956..2b0136ef91 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2329,24 +2329,46 @@ namespace dlib
     template <
         unsigned long num_outputs_,
-        linear_bias_mode bias_mode_
+        linear_bias_mode bias_mode_ = LINEAR_HAS_BIAS
         >
     class linear_
     {
         static_assert(num_outputs_ > 0, "The number of outputs from a linear_ layer must be > 0");
 
     public:
-        linear_() :
+        explicit linear_() :
             num_outputs(num_outputs_),
-            num_inputs(0),
+            num_inputs(0),
             learning_rate_multiplier(1),
             bias_mode(bias_mode_) {
         }
 
+        linear_(const linear_& other) :
+            num_outputs(other.num_outputs),
+            num_inputs(other.num_inputs),
+            learning_rate_multiplier(other.learning_rate_multiplier),
+            bias_mode(other.bias_mode),
+            params(other.params),
+            weights(other.weights),
+            biases(other.biases) {
+        }
+
+        linear_& operator=(const linear_& other) {
+            if (this != &other) {
+                num_outputs = other.num_outputs;
+                num_inputs = other.num_inputs;
+                learning_rate_multiplier = other.learning_rate_multiplier;
+                bias_mode = other.bias_mode;
+                params = other.params;
+                weights = other.weights;
+                biases = other.biases;
+            }
+            return *this;
+        }
+
         double get_learning_rate_multiplier() const { return learning_rate_multiplier; }
         void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
-
-        unsigned long get_num_inputs() const { return num_inputs; }
+        unsigned long get_num_outputs() const { return num_outputs; }
         void set_num_outputs(long num)
         {
@@ -2358,6 +2380,7 @@ namespace dlib
                 num_outputs = num;
             }
         }
+        unsigned long get_num_inputs() const { return num_inputs; }
         linear_bias_mode get_bias_mode() const { return bias_mode; }
 
         template <typename SUBNET>
@@ -2503,8 +2526,8 @@ namespace dlib
         }
 
     private:
-        unsigned long num_inputs;
         unsigned long num_outputs;
+        unsigned long num_inputs;
         double learning_rate_multiplier;
         linear_bias_mode bias_mode;
         resizable_tensor params;
@@ -2515,7 +2538,7 @@ namespace dlib
         unsigned long num_outputs,
         typename SUBNET
         >
-    using linear = add_layer<linear_<num_outputs, LINEAR_HAS_BIAS>, SUBNET>;
+    using linear = add_layer<linear_<num_outputs>, SUBNET>;
 
     template <
         unsigned long num_outputs,
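With `bias_mode_` defaulted above, `linear_<N>` now resolves to `linear_<N, LINEAR_HAS_BIAS>`. A minimal usage sketch (hypothetical toy network, not part of this patch):

// Hypothetical example: the `linear` alias now picks up LINEAR_HAS_BIAS by
// default, while `linear_no_bias` remains the explicit bias-free variant.
using toy_net = loss_multiclass_log<
                fc<10,
                relu<linear<128,
                input<matrix<float>>>>>>;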
diff --git a/dlib/tokenizer/bpe_tokenizer.h b/dlib/tokenizer/bpe_tokenizer.h
index f9457b554f..642f7c760b 100644
--- a/dlib/tokenizer/bpe_tokenizer.h
+++ b/dlib/tokenizer/bpe_tokenizer.h
@@ -20,49 +20,32 @@ namespace dlib
 {
-    constexpr size_t BPE_TOKENIZER_MAX_TOKEN_LENGTH = 8;
-    constexpr int BPE_TOKENIZER_BASE_VOCAB_SIZE = 256;
 
     class bpe_tokenizer
     {
     public:
-        bpe_tokenizer() : vocab_size(BPE_TOKENIZER_BASE_VOCAB_SIZE)
+        bpe_tokenizer() : vocab_size(BASE_VOCAB_SIZE)
        {
             // Initialize the base vocabulary with single bytes
-            for (int i = 0; i < BPE_TOKENIZER_BASE_VOCAB_SIZE; ++i)
+            for (int i = 0; i < BASE_VOCAB_SIZE; ++i)
                 vocab[i] = std::vector<uint8_t>{ static_cast<uint8_t>(i) };
 
             // Initialize special tokens with sequential IDs
-            special_tokens =
-            {
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 1},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 2},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 3},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 4},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 5},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 7},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 9},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 10},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 11},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 12},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 13},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 14},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 15},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 16},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 17},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 18},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 19},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 20},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 21},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 22},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 23},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 24},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 25},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 26},
-                {"", BPE_TOKENIZER_BASE_VOCAB_SIZE + 27}
+            special_tokens = {
+                {"", BASE_VOCAB_SIZE},      {"", BASE_VOCAB_SIZE + 1},
+                {"", BASE_VOCAB_SIZE + 2},  {"", BASE_VOCAB_SIZE + 3},
+                {"", BASE_VOCAB_SIZE + 4},  {"", BASE_VOCAB_SIZE + 5},
+                {"", BASE_VOCAB_SIZE + 7},
+                {"", BASE_VOCAB_SIZE + 9},
+                {"", BASE_VOCAB_SIZE + 10}, {"", BASE_VOCAB_SIZE + 11},
+                {"", BASE_VOCAB_SIZE + 12}, {"", BASE_VOCAB_SIZE + 13},
+                {"", BASE_VOCAB_SIZE + 14}, {"", BASE_VOCAB_SIZE + 15},
+                {"", BASE_VOCAB_SIZE + 16}, {"", BASE_VOCAB_SIZE + 17},
+                {"", BASE_VOCAB_SIZE + 18}, {"", BASE_VOCAB_SIZE + 19},
+                {"", BASE_VOCAB_SIZE + 20}, {"", BASE_VOCAB_SIZE + 21},
+                {"", BASE_VOCAB_SIZE + 22}, {"", BASE_VOCAB_SIZE + 23},
+                {"", BASE_VOCAB_SIZE + 24}, {"", BASE_VOCAB_SIZE + 25},
+                {"", BASE_VOCAB_SIZE + 26}, {"", BASE_VOCAB_SIZE + 27}
             };
 
             // Initialize the vector of special token IDs
@@ -73,57 +56,99 @@ namespace dlib
         // Train the tokenizer on the given text
         void train(const std::string& text, int vocab_size, bool verbose = false)
         {
-            DLIB_CASSERT(vocab_size >= BPE_TOKENIZER_BASE_VOCAB_SIZE);
-            this->vocab_size = vocab_size;
-            int num_merges = vocab_size - BPE_TOKENIZER_BASE_VOCAB_SIZE;
+            int current_base = static_cast<int>(BASE_VOCAB_SIZE + special_tokens.size());
+            DLIB_CASSERT(vocab_size >= current_base);
+            int num_merges = vocab_size - current_base;
+            if (num_merges <= 0) return;
 
             // Convert text to byte IDs
             std::vector<int> ids;
+            ids.reserve(text.size());
             for (char c : text) ids.push_back(static_cast<uint8_t>(c));
 
             // Perform BPE merges
-            for (int i = 0; i < num_merges; ++i) {
+            int n_merges = 0;
+            for (; n_merges < num_merges; ++n_merges) {
                 auto stats = get_stats(ids);
                 if (stats.empty()) break;
 
-                // Find the most frequent pair that does not exceed BPE_TOKENIZER_MAX_TOKEN_LENGTH
+                // Find the most frequent pair that does not exceed MAX_TOKEN_LENGTH
                 auto pair = get_most_frequent_pair(stats);
+                if (pair.first == -1) break;
 
-                // Check if the resulting token would exceed BPE_TOKENIZER_MAX_TOKEN_LENGTH
+                // Check if the resulting token would exceed MAX_TOKEN_LENGTH
                 size_t new_token_length = vocab[pair.first].size() + vocab[pair.second].size();
-                if (new_token_length > BPE_TOKENIZER_MAX_TOKEN_LENGTH) {
+                if (new_token_length > MAX_TOKEN_LENGTH) {
                     if (verbose)
-                    {
-                        std::cout << "\r"
-                            << std::setw(100) << std::flush
-                            << "\rskipping merge " << std::to_string(i + 1) << "/" << std::to_string(num_merges) << ": ("
-                            << std::to_string(pair.first) << "," << std::to_string(pair.second) << ") -> new token length "
-                            << std::to_string(new_token_length) << " exceeds limit of " << std::to_string(BPE_TOKENIZER_MAX_TOKEN_LENGTH)
-                            << std::flush;
-                    }
+                        std::cout << "\r" << std::setw(100) << std::flush << "\r[skip] merge " << (n_merges + 1)
+                            << ": token too long: " << new_token_length << "/" << MAX_TOKEN_LENGTH << std::flush;
                     continue; // Skip this merge
                 }
-                int idx = (BPE_TOKENIZER_BASE_VOCAB_SIZE + (int)special_tokens.size()) + i;
-                ids = merge(ids, pair, idx);
-                merges[pair] = idx;
-                vocab[idx].insert(vocab[idx].end(), vocab[pair.first].begin(), vocab[pair.first].end());
-                vocab[idx].insert(vocab[idx].end(), vocab[pair.second].begin(), vocab[pair.second].end());
+                int new_id = current_base + n_merges;
+                merges[pair] = new_id;
+
+                std::vector<uint8_t>& new_token = vocab[new_id];
+                new_token.reserve(new_token_length);
+                new_token.insert(new_token.end(), vocab[pair.first].begin(), vocab[pair.first].end());
+                new_token.insert(new_token.end(), vocab[pair.second].begin(), vocab[pair.second].end());
+
+                ids = merge(ids, pair, new_id);
 
                 if (verbose)
-                {
-                    std::cout << "\r"
-                        << std::setw(100) << std::flush
-                        << "\rmerge " << std::to_string(i + 1) << "/" << std::to_string(num_merges) << ": ("
-                        << std::to_string(pair.first) << "," << std::to_string(pair.second) << ") -> " << std::to_string(idx)
-                        << " (" << bytes_to_string(vocab[idx]) << ") had "
-                        << std::to_string(stats[pair]) << " occurrences"
-                        << std::endl;
+                    std::cout << "\r" << std::setw(100) << std::flush << "\r[merge] " << (n_merges + 1) << "/" << num_merges
+                        << ": (" << pair.first << "," << pair.second << ") -> " << new_id
+                        << " (" << bytes_to_string(vocab[new_id]) << ")" << std::endl;
             }
+            this->vocab_size = current_base + n_merges;
         }
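+
+        // Illustrative example (values for illustration only): training on the
+        // string "aaab" starts from the byte ids [97, 97, 97, 98]. The pair
+        // (97, 97) occurs twice and scores highest, so it becomes the first
+        // merged id (current_base + 0) and the sequence collapses to
+        // [current_base, 97, 98]. Each pass rescans the shortened sequence
+        // until num_merges merges have been made or no mergeable pair remains.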
+
+        // Encode the given text into subword tokens without paragraph splitting or special token wrapping
+        std::vector<int> encode_raw(const std::string& text) const
+        {
+            // Direct encoding without paragraph splitting or special tokens
+            std::vector<int> ids;
+            ids.reserve(text.size());
+
+            // Convert text to character IDs
+            for (char c : text) ids.push_back(static_cast<uint8_t>(c));
+
+            // Apply BPE merges
+            auto stats = get_stats(ids);
+            std::priority_queue<std::pair<int, std::pair<int, int>>> pq;
+            for (const auto& stat : stats) {
+                const std::pair<int, int>& pair = stat.first;
+                if (merges.count(pair)) pq.push({ merges.at(pair), pair });
+            }
+
+            while (!pq.empty()) {
+                // Copy before popping: a reference into the queue would dangle after pop().
+                const std::pair<int, int> pair = pq.top().second;
+                pq.pop();
+
+                bool pair_found = false;
+                for (size_t i = 0; i + 1 < ids.size(); ++i) {
+                    if (ids[i] == pair.first && ids[i + 1] == pair.second) {
+                        pair_found = true;
+                        break;
+                    }
+                }
+                if (!pair_found) continue;
+
+                int idx = merges.at(pair);
+                ids = merge(ids, pair, idx);
+
+                stats = get_stats(ids);
+                for (const auto& stat : stats) {
+                    const std::pair<int, int>& new_pair = stat.first;
+                    if (merges.count(new_pair)) pq.push({ merges.at(new_pair), new_pair });
+                }
+            }
+
+            return ids;
+        }
 
-        // Encode the given text into subword tokens
+        // Encode the given text into subword tokens (advanced version)
         std::vector<int> encode(const std::string& text) const
         {
             std::vector<int> result_ids;
@@ -247,7 +272,7 @@ namespace dlib
         // Save the tokenizer model and vocabulary to file
         friend void serialize(const bpe_tokenizer& tok, std::ostream& out)
         {
-            serialize("bpe_tokenizer2_", out);
+            serialize("bpe_tokenizer_", out);
             serialize(tok.special_tokens, out);
             serialize(tok.special_token_map, out);
             serialize(tok.merges, out);
@@ -259,7 +284,7 @@ namespace dlib
         friend void deserialize(bpe_tokenizer& tok, std::istream& in)
         {
             std::string version;
             dlib::deserialize(version, in);
-            if (version != "bpe_tokenizer2_")
+            if (version != "bpe_tokenizer_")
                 throw dlib::serialization_error("Unexpected version '" + version + "' found while deserializing dlib::bpe_tokenizer_.");
             deserialize(tok.special_tokens, in);
             deserialize(tok.special_token_map, in);
@@ -289,6 +314,9 @@ namespace dlib
         std::map<int, std::vector<uint8_t>> vocab;
         int vocab_size;
 
+        static const size_t MAX_TOKEN_LENGTH = 8;
+        static const int BASE_VOCAB_SIZE = 256;
+
         // Get frequency statistics of adjacent token pairs
         struct pair_hash {
             template <class T1, class T2>
@@ -339,14 +367,16 @@ namespace dlib
             // Iterate over all pairs in the statistics map
             for (const auto& stat : stats) {
                 const std::pair<int, int>& pair = stat.first; // Extract the token pair
-                int count = stat.second; // Extract the frequency count
+                int frequency = stat.second; // Extract the frequency
 
                 // Check if the new token formed by merging the pair would exceed the maximum allowed length
                 size_t new_token_length = vocab.at(pair.first).size() + vocab.at(pair.second).size();
-                if (new_token_length > BPE_TOKENIZER_MAX_TOKEN_LENGTH) continue; // Skip this pair if it exceeds the maximum token length
+                if (new_token_length > MAX_TOKEN_LENGTH) continue; // Skip this pair if it exceeds the maximum token length
 
                 // Calculate the score for this pair (frequency * length_penalty)
-                double score = (size_t)count * (new_token_length > (BPE_TOKENIZER_MAX_TOKEN_LENGTH / 2) ? 1.75 : 1.0);
+                double length_bonus = std::min(2.0, 1.0 + (static_cast<double>(new_token_length) - 2.0) * 0.1);
+                double frequency_weight = std::log1p(frequency);
+                double score = frequency_weight * length_bonus;
 
                 // Update the best pair if the current pair has a higher score
                 if (score > max_score)
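To make the revised scoring concrete: a pair seen 100 times whose merged token would be 6 bytes long scores log1p(100) * min(2.0, 1.0 + (6 - 2) * 0.1) = 4.62 * 1.4 ≈ 6.46, while a pair seen 400 times that would form only a 2-byte token scores log1p(400) * 1.0 ≈ 5.99. Under the old linear rule the raw counts would have decided the outcome (175 vs 400); the logarithmic frequency weight lets longer, more informative merges outrank sheer frequency.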
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c23067879a..1232d58b09 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -147,6 +147,7 @@ add_gui_example(dnn_dcgan_train_ex)
 add_gui_example(dnn_yolo_train_ex)
 add_gui_example(dnn_self_supervised_learning_ex)
 add_example(slm_basic_train_ex)
+add_example(slm_advanced_train_ex)
 add_gui_example(3d_point_cloud_ex)
 add_example(bayes_net_ex)
 add_example(bayes_net_from_disk_ex)
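Before the new example file, a minimal driver for the tokenizer API above (a hypothetical sketch, not part of this patch; the corpus path and target vocabulary size are placeholders):

#include <dlib/tokenizer/bpe_tokenizer.h>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main()
{
    // Hypothetical driver, for illustration only.
    std::ifstream fin("corpus.txt");  // placeholder training corpus
    const std::string text((std::istreambuf_iterator<char>(fin)),
                           std::istreambuf_iterator<char>());

    dlib::bpe_tokenizer tok;
    // Target vocab = 256 base bytes + special tokens + learned merges.
    tok.train(text, 3500, true);

    // encode_raw() skips paragraph splitting and special-token wrapping.
    const std::vector<int> ids = tok.encode_raw("Hello, world!");
    std::cout << "token count: " << ids.size() << std::endl;

    dlib::serialize("bpe_tokenizer.dat") << tok;  // round-trips via deserialize()
    return 0;
}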
diff --git a/examples/slm_advanced_train_ex.cpp b/examples/slm_advanced_train_ex.cpp
new file mode 100644
index 0000000000..b655b13ba1
--- /dev/null
+++ b/examples/slm_advanced_train_ex.cpp
@@ -0,0 +1,1305 @@
+/*!
+    @file slm_advanced_train_ex.cpp
+    @brief Transformer-based text training/generation
+
+    This program implements a complete training and generation pipeline for a
+    Transformer-based text compression system. The model features:
+
+    1. Rotary Positional Embeddings (RoPE) for enhanced positional encoding
+    2. Multi-head self-attention with efficient memory handling
+    3. Mixture-of-Experts architecture for specialized processing
+    4. BPE tokenization with custom vocabulary
+    5. Full training/generation/verification workflow
+
+    Key capabilities demonstrated:
+    - Perfect memorization and reproduction of training text
+    - Efficient autoregressive generation
+    - Byte-level verification of reconstructed text
+
+    References:
+    [1] Vaswani et al., "Attention Is All You Need" (Transformer architecture)
+        arXiv:1706.03762
+    [2] Su et al., "RoFormer: Enhanced Transformer with Rotary Position Embedding"
+        arXiv:2104.09864
+    [3] Shazeer et al., "Outrageously Large Neural Networks: The Sparsely-Gated
+        Mixture-of-Experts Layer" (MoE architecture) arXiv:1701.06538
+
+    Usage modes:
+    --train          Train model on enwiki dataset
+    --generate       Generate text from trained model
+    --verify         Compare generated output with original
+    --tokenize-only  Only perform tokenization step
+
+    Configuration:
+    - Adjust template parameters in transformer_config for model architecture
+    - Modify training parameters in main() for optimization
+    - Set sequence length and memory limits according to available hardware
+!*/
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+using namespace dlib;
+
+namespace dlib
+{
+    /*!
+        @class rotary_positional_embedding_
+        @brief Implements Rotary Positional Embeddings (RoPE) for transformers
+
+        This layer applies rotary positional embeddings to queries and keys in
+        self-attention layers, providing relative positional information without
+        absolute position embeddings.
+
+        The implementation follows the RoPE formulation from [2], where positions
+        are encoded through rotation matrices applied to pairs of dimensions.
+    !*/
+    class rotary_positional_embedding_ {
+    public:
+        explicit rotary_positional_embedding_() = default;
+
+        template <typename SUBNET>
+        void setup(const SUBNET& sub) {
+            // Precompute the rotation angles and their trigonometric values
+            seq_len = sub.get_output().nr();
+            d_head = sub.get_output().nc();
+            compute_rotation_angles();
+            precompute_trigonometric_values();
+        }
+
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output) {
+            const tensor& input = sub.get_output();
+            output.copy_size(input);
+            tt::copy_tensor(false, output, 0, input, 0, input.k());
+
+            // Apply rotary embedding to the output
+            apply_rotary_embedding(output);
+        }
+
+        template <typename SUBNET>
+        void backward(
+            const tensor& gradient_input,
+            SUBNET& sub,
+            tensor& params_grad
+        ) {
+            tensor& prev = sub.get_gradient_input();
+            resizable_tensor grad_output;
+            grad_output.copy_size(gradient_input);
+            tt::copy_tensor(false, grad_output, 0, gradient_input, 0, gradient_input.k());
+
+            // Apply the inverse rotation to the gradient (transpose of the rotation matrix)
+            apply_rotary_embedding(grad_output, true);
+            tt::copy_tensor(true, prev, 0, grad_output, 0, grad_output.k());
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const rotary_positional_embedding_& item, std::ostream& out) {
+            std::string version = "rotary_positional_embedding_";
+            dlib::serialize(version, out);
+            dlib::serialize(item.seq_len, out);
+            dlib::serialize(item.d_head, out);
+            dlib::serialize(item.angles, out);
+            dlib::serialize(item.cos_values, out);
+            dlib::serialize(item.sin_values, out);
+        }
+
+        friend void deserialize(rotary_positional_embedding_& item, std::istream& in) {
+            std::string version;
+            dlib::deserialize(version, in);
+            if (version != "rotary_positional_embedding_")
+                throw serialization_error("Unexpected version found while deserializing rotary_positional_embedding_.");
+            dlib::deserialize(item.seq_len, in);
+            dlib::deserialize(item.d_head, in);
+            dlib::deserialize(item.angles, in);
+            dlib::deserialize(item.cos_values, in);
+            dlib::deserialize(item.sin_values, in);
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const rotary_positional_embedding_& item) {
+            out << "rotary_positional_embedding";
+            out << " (d_head=" << item.d_head << ", seq_len=" << item.seq_len << ")";
+            return out;
+        }
+
+        friend void to_xml(const rotary_positional_embedding_& item, std::ostream& out)
+        {
+            out << "<rotary_positional_embedding/>\n";
+        }
+
+    protected:
+        void compute_rotation_angles() {
+            // Following the original RoPE paper formulation
+            const float base = 10000.0f;
+            const long half_dim = d_head / 2;
+            angles.set_size(seq_len, half_dim);
+
+            for (long pos = 0; pos < seq_len; ++pos) {
+                for (long i = 0; i < half_dim; ++i) {
+                    float inv_freq = std::pow(base, -2.0f * (i + 0.5f) / d_head);
+                    angles(pos, i) = pos * inv_freq;
+                }
+            }
+        }
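+
+        // For reference, the rotation applied per position `pos` and dimension
+        // pair (2i, 2i+1), with theta = angles(pos, i) -- see [2]:
+        //   x'[2i]   = x[2i] * cos(theta) - x[2i+1] * sin(theta)
+        //   x'[2i+1] = x[2i] * sin(theta) + x[2i+1] * cos(theta)
+        // backward() applies the transposed (inverse) rotation, which is why
+        // this layer needs no learnable parameters.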
+
+        void precompute_trigonometric_values() {
+            // Precompute cos and sin for all angles
+            cos_values.set_size(angles.nr(), angles.nc());
+            sin_values.set_size(angles.nr(), angles.nc());
+
+            // Index row/column explicitly: dlib matrices only allow linear
+            // indexing on vectors, not on 2-D matrices.
+            for (long r = 0; r < angles.nr(); ++r) {
+                for (long c = 0; c < angles.nc(); ++c) {
+                    cos_values(r, c) = std::cos(angles(r, c));
+                    sin_values(r, c) = std::sin(angles(r, c));
+                }
+            }
+        }
+
+        template <typename tensor_type>
+        void apply_rotary_embedding(
+            tensor_type& x,
+            bool is_backward = false
+        ) const {
+            const long batch_size = x.num_samples();
+            const long num_heads = x.k();
+            const long seq_length = x.nr();
+            const long dim = x.nc();
+            const bool is_odd = (dim % 2 != 0);
+            const long rot_dim = is_odd ? dim - 1 : dim;
+
+            DLIB_CASSERT(dim == d_head, "Input dimension must match d_head param");
+            DLIB_CASSERT(seq_length == seq_len, "Sequence length must match seq_len param");
+
+            auto* ptr = x.host();
+            const long stride = seq_length * dim;
+
+            for (long n = 0; n < batch_size; ++n) {
+                for (long h = 0; h < num_heads; ++h) {
+                    auto* x_ptr = ptr + (n * num_heads + h) * stride;
+
+                    for (long pos = 0; pos < seq_length; ++pos) {
+                        const float* cos = &cos_values(pos, 0);
+                        const float* sin = &sin_values(pos, 0);
+
+                        for (long i = 0; i < rot_dim; i += 2) {
+                            const float x0 = x_ptr[pos * dim + i];
+                            const float x1 = x_ptr[pos * dim + i + 1];
+
+                            if (!is_backward) {
+                                x_ptr[pos * dim + i] = x0 * cos[i / 2] - x1 * sin[i / 2];
+                                x_ptr[pos * dim + i + 1] = x0 * sin[i / 2] + x1 * cos[i / 2];
+                            }
+                            else {
+                                x_ptr[pos * dim + i] = x0 * cos[i / 2] + x1 * sin[i / 2];
+                                x_ptr[pos * dim + i + 1] = -x0 * sin[i / 2] + x1 * cos[i / 2];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    private:
+        long seq_len, d_head;       // Sequence length and dimension of each head
+        matrix<float> angles;       // Precomputed rotation angles (seq_len x d_head/2)
+        matrix<float> cos_values;   // Precomputed cosine values
+        matrix<float> sin_values;   // Precomputed sine values
+        resizable_tensor params;    // Empty tensor (no learnable parameters)
+    };
+
+    // Helper to easily add RoPE to a network
+    template <typename SUBNET>
+    using rope = add_layer<rotary_positional_embedding_, SUBNET>;
+
+    template <long d_k_>
+    class scale_weights_ : public multiply_ {
+    public:
+        explicit scale_weights_() : multiply_(1.0f / std::sqrt(static_cast<float>(d_k_))) {}
+    };
+
+    template <long d_k, typename SUBNET>
+    using scale_weights = add_layer<scale_weights_<d_k>, SUBNET>;
+
+    // Attention mechanism component extractors
+    template
+    using query = reshape_to>;
+
+    template
+    using key = reshape_to>;
+
+    template
+    using value = reshape_to>;
+
+    /*!
+        This layer implements multi-head self-attention.
+
+        Template parameters:
+        - ACT: Activation function type
+        - DO: Dropout layer type for regularization
+        - d_model: Model dimension (must be divisible by num_heads)
+        - num_heads: Number of attention heads
+    !*/
+    template