From 8d0aa3621c9ccd039240945b50816aac6a8d8005 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Sat, 22 Feb 2025 14:26:34 +0800 Subject: [PATCH 01/17] feat: add max pool --- include/modules/layers/max_pool.hpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 include/modules/layers/max_pool.hpp diff --git a/include/modules/layers/max_pool.hpp b/include/modules/layers/max_pool.hpp new file mode 100644 index 0000000..93ad1e1 --- /dev/null +++ b/include/modules/layers/max_pool.hpp @@ -0,0 +1,25 @@ +#pragma once +#include "module.hpp" +#include "conv2d_utils.hpp" + +namespace nn +{ + class MaxPool2d : public Module + { + public: + MaxPool2d(var_pair kernel_size, var_pair stride, var_pair padding, const string &padding_mode = "zeros"); + + virtual Tensor<> forward(const Tensor<> &input) override; + virtual Tensor<> backward(const Tensor<> &grad_output) override; + virtual void update_params(const float lr) override; + + private: + size_tp2 kernel_size_; + size_tp2 stride_; + size_tp2 padding_; + PaddingMode padding_mode_; + Padding padding_module_; + vector original_input_shape_; + Tensor<> grad_input_; + }; +} \ No newline at end of file From f60967977b8663012a430028851df7b8e4f14241 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Sat, 22 Feb 2025 14:26:43 +0800 Subject: [PATCH 02/17] refractor --- src/modules/activations/softmax.cpp | 2 +- src/modules/losses/cross_entropy.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modules/activations/softmax.cpp b/src/modules/activations/softmax.cpp index 6fe51c2..d9bf626 100644 --- a/src/modules/activations/softmax.cpp +++ b/src/modules/activations/softmax.cpp @@ -14,7 +14,7 @@ Tensor<> Softmax::softmax_helper(const Tensor<> &input) { return exp(x); }); double sum = result.sum(); - return result * (1 / sum); + return result / sum; } vector Softmax::softmax_helper(const vector &input) diff --git a/src/modules/losses/cross_entropy.cpp b/src/modules/losses/cross_entropy.cpp index b20ed72..c028c89 100644 --- a/src/modules/losses/cross_entropy.cpp +++ b/src/modules/losses/cross_entropy.cpp @@ -81,7 +81,7 @@ Tensor<> CrossEntropyLoss::backward() grad_output[i, static_cast(this->Y_cache_[i])] -= 1.0f; } - grad_output *= 1.0f / B; + grad_output /= B; return grad_output; } \ No newline at end of file From a11f99b746ff4c353f0edc011249c593d1137405 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Sat, 22 Feb 2025 14:26:51 +0800 Subject: [PATCH 03/17] feat: add max pool --- src/modules/layers/max_pool.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/modules/layers/max_pool.cpp diff --git a/src/modules/layers/max_pool.cpp b/src/modules/layers/max_pool.cpp new file mode 100644 index 0000000..e69de29 From dc667a327e36ed4e5b5127d53ec3dfa1fdf35310 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Sat, 22 Feb 2025 14:27:05 +0800 Subject: [PATCH 04/17] refractor --- include/models/mlp.hpp | 1 + include/modules/layers/conv2d.hpp | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/models/mlp.hpp b/include/models/mlp.hpp index 2ce365e..b3bbf8b 100644 --- a/include/models/mlp.hpp +++ b/include/models/mlp.hpp @@ -2,6 +2,7 @@ #include "module.hpp" using namespace nn; + class MLP : public Module { public: MLP(vector layer_sizes, double dropout_p = 0.5); diff --git a/include/modules/layers/conv2d.hpp b/include/modules/layers/conv2d.hpp index 1868be9..e70ff59 100644 --- a/include/modules/layers/conv2d.hpp +++ b/include/modules/layers/conv2d.hpp @@ -2,7 +2,6 
@@ #include #include "module.hpp" #include "conv2d_utils.hpp" -using namespace nn; namespace nn { @@ -25,11 +24,13 @@ namespace nn void reset_parameters(); - void set_weight(const Tensor<> &target_weight) { this->weight_ = target_weight; } - void set_bias(const Tensor<> &target_bias) { this->bias_ = target_bias; } + // Setters + inline void set_weight(const Tensor<> &target_weight) { this->weight_ = target_weight; } + inline void set_bias(const Tensor<> &target_bias) { this->bias_ = target_bias; } - const Tensor<> &get_weight() const { return this->weight_; } - const Tensor<> &get_bias() const { return this->bias_; } + // Getters + inline const Tensor<> &get_weight() const { return this->weight_; } + inline const Tensor<> &get_bias() const { return this->bias_; } private: size_t in_channels_; @@ -42,6 +43,7 @@ namespace nn PaddingMode padding_mode_; Padding padding_module_; vector original_input_shape_; + Tensor<> weight_; Tensor<> bias_; Tensor<> grad_weight_; From d2e1d40229ef80f7cce2559095a88851630407fe Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Sat, 22 Feb 2025 14:27:20 +0800 Subject: [PATCH 05/17] Will be implemented --- include/models/cnn.hpp | 19 +++++++++++++++++++ src/models/cnn.cpp | 0 2 files changed, 19 insertions(+) create mode 100644 include/models/cnn.hpp create mode 100644 src/models/cnn.cpp diff --git a/include/models/cnn.hpp b/include/models/cnn.hpp new file mode 100644 index 0000000..c9f955a --- /dev/null +++ b/include/models/cnn.hpp @@ -0,0 +1,19 @@ +#pragma once +#include "module.hpp" +using namespace nn; + +class CNN : public Module +{ +public: + CNN(vector layer_sizes, double dropout_p = 0.5); + CNN(initializer_list layer_sizes, double dropout_p = 0.5); + ~CNN(); + + virtual Tensor<> forward(const Tensor<> &input) override; + virtual Tensor<> backward(const Tensor<> &grad_output) override; + virtual void update_params(const float lr) override; + +private: + vector layers_; + int num_layers_; +}; diff --git a/src/models/cnn.cpp b/src/models/cnn.cpp new file mode 100644 index 0000000..e69de29 From d33cc394098a4d481f29ecdc7e8ed7c7376adf89 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 17:18:46 +0800 Subject: [PATCH 06/17] feat: add containers dependency --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc64f52..f8f8279 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ endif() include_directories( include/ include/core + include/modules/containers include/modules/layers include/modules/activations include/modules/losses @@ -34,6 +35,7 @@ set(SOURCE_FILES src/core/tensor.cpp src/utils/tensor_utils.cpp src/core/module.cpp + src/modules/containers/sequential.cpp src/modules/layers/linear.cpp src/modules/layers/conv2d.cpp src/modules/layers/flatten.cpp @@ -54,7 +56,7 @@ set(SOURCE_FILES add_library(neuralnet ${SOURCE_FILES}) # Add the executable for the main example -add_executable(main examples/test_conv2d.cpp) +add_executable(main examples/main.cpp) target_link_libraries(main neuralnet) # Only build tests if BUILD_TESTS is ON From aff225502b2c9a67ab052ba824ea1fbce2e28fc6 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:42:40 +0800 Subject: [PATCH 07/17] fix: update mlp initialization --- examples/main.cpp | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/examples/main.cpp b/examples/main.cpp index 57a301f..d00d6a5 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -11,10 +11,10 
@@ int main() // Define the hyperparameters - const double LR = 0.01; - const double EPOCH = 10; - const double BATCH_SIZE = 64; - const double DROPOUT_P = 0.3; + const float LR = 0.01; + const float EPOCH = 10; + const float BATCH_SIZE = 64; + const float DROPOUT_P = 0.3; MNIST dataset(BATCH_SIZE); @@ -29,7 +29,8 @@ int main() } // Initialize the model - MLP model = MLP({784, 128, 64, 10}, DROPOUT_P); + bool bias = true; + MLP model = MLP(784, {128, 64, 10}, bias, DROPOUT_P); cout << "Finished model initialization" << endl; @@ -38,14 +39,15 @@ int main() cout << "Finished loss initialization" << endl; - double loss = 0.0; - double acc = 0.0; - vector loss_list; - vector accuracy_list; + float loss = 0.0; + float acc = 0.0; + vector loss_list; + vector accuracy_list; cout << "Training started..." << endl; - // // Train the model + // ============================ Training ==================================== + // Example of iterating through all batches for (size_t e = 0; e < EPOCH; e++) { @@ -77,8 +79,8 @@ int main() print_stats_line(i, loss, acc); } - double total_loss = accumulate(loss_list.begin(), loss_list.end(), 0.0) / loss_list.size(); - double total_acc = accumulate(accuracy_list.begin(), accuracy_list.end(), 0.0) / accuracy_list.size() * 100; + float total_loss = accumulate(loss_list.begin(), loss_list.end(), 0.0) / loss_list.size(); + float total_acc = accumulate(accuracy_list.begin(), accuracy_list.end(), 0.0) / accuracy_list.size() * 100; cout << "------------------------------------" << endl; cout << "Total Loss in Epoch " << e + 1 << " = " << total_loss << "" << endl; @@ -86,7 +88,7 @@ int main() cout << "------------------------------------" << endl; } - // Inference + // ============================ Inference ==================================== model.eval(); @@ -127,8 +129,11 @@ int main() print_stats_line(i, loss, acc); } - double total_loss = accumulate(loss_list.begin(), loss_list.end(), 0.0) / loss_list.size(); - double total_acc = accumulate(accuracy_list.begin(), accuracy_list.end(), 0.0) / accuracy_list.size() * 100; + float total_loss = accumulate(loss_list.begin(), loss_list.end(), 0.0) / loss_list.size(); + float total_acc = accumulate(accuracy_list.begin(), accuracy_list.end(), 0.0) / accuracy_list.size() * 100; + + cout << "Average Loss on Test Data = " << total_loss << "" << endl; + cout << "Average Accuracy on Test Data = " << total_acc << "%" << endl; cout << "------------------------------------" << endl; From 3af465e83974c40a819d878cc4fa372391b4d30b Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:43:07 +0800 Subject: [PATCH 08/17] feat: update tensor copy constructor and add move constructor --- docs/tensor.md | 35 ++++++++++++++---- include/core/tensor.hpp | 79 +++++++++++++++++++++++------------------ 2 files changed, 73 insertions(+), 41 deletions(-) diff --git a/docs/tensor.md b/docs/tensor.md index 0d6f62e..8e8d10d 100644 --- a/docs/tensor.md +++ b/docs/tensor.md @@ -26,7 +26,7 @@ You can create your tensor from C++ array, or using `vector` in C++ STL. 
You can ```cpp #include "tensor.hpp" -// default type is double +// default type is float Tensor<> your_tensor = { { 1.2, 2.3, 3.4 }, { 4.5, 5.6, 6.7 } }; // shape: (2, 3) // Or you can create a tensor with a specific type @@ -36,7 +36,7 @@ Tensor your_int_tensor = { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 } } // shape Tensor<> transposed_tensor = your_tensor.transpose(); // shape: (3, 2) // You can also create a tensor from a vector -vector> your_vec = { { 1.2, 2.3, 3.4 }, { 4.5, 5.6, 6.7 } }; +vector> your_vec = { { 1.2, 2.3, 3.4 }, { 4.5, 5.6, 6.7 } }; Tensor<> your_tensor_from_vec = Tensor<>(your_vec); ``` @@ -172,7 +172,7 @@ Tensor A = { { 1, 2, 3 }, Tensor A_float = A.dtype(); -Tensor<> A_double = A.dtype(); // since the default type of tensor is double +Tensor<> A_float = A.dtype(); // since the default type of tensor is float ``` ## Filter the unwanted elements @@ -200,7 +200,7 @@ Function mapping also can be applied to the tensor, simply by using `map`. It ta Tensor<> A = { { 1, 2, 3 }, { 4, 5, 6 } }; // 2 x 3 -Tensor<> A_mapped = A.map([](double x) { return exp(x); }); +Tensor<> A_mapped = A.map([](float x) { return exp(x); }); /* { { 2.71828, 7.38906, 20.0855 }, { 54.5982, 148.413, 403.429 } } @@ -245,12 +245,35 @@ Tensor tensor_1d_argmax = tensor_1d.argmin(); ## Flatten tensor -You can flatten your tensor using `flatten` function. It returns a 1-D tensor. +You can flatten your tensor using `flatten` function. It flattens the dimensions of the tensor from start_dim to end_dim into a single dimension. Default of start_dim and end_dim is 0 and -1 respectively. ```cpp Tensor A = { { 1, 2, 3 }, { 4, 5, 6 } }; // 2 x 3 Tensor A_flatten = A.flatten(); -// { 1, 2, 3, 4, 5, 6 } +// [ 1, 2, 3, 4, 5, 6 ] + +Tensor<> B_3d = { { { -1, -2, -3 }, + {-4, -5, -6 } }, + { { 1, 2, 3 }, + { 4, 5, 6 } } }; // 2 x 2 x 3 + +Tensor<> B_flatten_12 = B_3d.flatten(0, 1) // flatten the first and second dimension +/* +[ + [-1, -2, -3], + [-4, -5, -6], + [1, 2, 3], + [4, 5, 6] +] +*/ + +Tensor<> B_flatten_23 = B_3d.flatten(1, 2) // flatten the second and the third (last) dimension +/* +[ + [-1, -2, -3, -4, -5, -6], + [1, 2, 3, 4, 5, 6] +] +*/ ``` diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 00f201f..2c336f1 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -2,7 +2,7 @@ #include "tensor_utils.hpp" using namespace std; -template +template class Tensor { private: @@ -312,35 +312,6 @@ class Tensor this->compute_contiguous_strides(); } - // // Recursive helper to process nested initializer lists - // template - // void flatten_list(const std::initializer_list& list, size_t depth = 0) { - // // Handle the current dimension - // if (depth == shapes_.size()) { - // // First encounter with this depth: record size - // shapes_.push_back(list.size()); - // } else { - // // Verify size matches the existing dimension - // if (list.size() != shapes_[depth]) { - // throw std::invalid_argument("Inconsistent shape at depth " + std::to_string(depth)); - // } - // } - - // // Recurse or add data - // if constexpr (is_list::value) { - // // Process nested lists - // for (const auto& elem : list) { - // flatten_list(elem, depth + 1); - // } - // } else { - // // Ensure element type matches Tensor type - // // static_assert(std::is_same_v, "Element type must match Tensor type"); - // for (const auto& elem : list) { - // data_.push_back(static_cast(elem)); - // } - // } - // } - // Scaler constructor Tensor(const T &value) { @@ -432,10 +403,27 @@ class Tensor } // copy constructor + // 
Direct initialization with member initializer lists is more efficient than first default-constructing members and then assigning values. Tensor(const Tensor &other) + : data_(make_shared>(*(other.data_))), + shape_(other.shape_), + strides_(other.strides_), + offset_(other.offset_), + size_(other.size_) + { + } + + // move constructor + Tensor(Tensor &&other) noexcept + : data_(std::move(other.data_)), + shape_(std::move(other.shape_)), + strides_(std::move(other.strides_)), + offset_(other.offset_), + size_(other.size_) { - // already overload the = operator - *this = other; + // Reset other to a valid but empty state + other.offset_ = 0; + other.size_ = -1; } // template @@ -724,7 +712,7 @@ class Tensor * @throws std::out_of_range if start_dim or end_dim is out of the range of the tensor's dimensions. */ - Tensor<> flatten(int64_t start_dim = 0, int64_t end_dim = -1) const + Tensor flatten(int64_t start_dim = 0, int64_t end_dim = -1) const { if (start_dim < 0) { @@ -898,10 +886,10 @@ class Tensor } /// @brief Convert the tensor to a tensor of a different type. - /// @details If U is not provided, it defaults to double. + /// @details If U is not provided, it defaults to float. /// @param U the type to convert to /// @return a tensor with the same shape and data, but with the type U - template + template Tensor dtype() const { return dtype_impl(*this); @@ -1096,6 +1084,27 @@ class Tensor return *this; } + // move assignment operator + Tensor &operator=(Tensor &&other) noexcept + { + if (this != &other) { + // Move resources from other to this + this->data_ = std::move(other.data_); + this->shape_ = std::move(other.shape_); + this->strides_ = std::move(other.strides_); + this->offset_ = other.offset_; + this->size_ = other.size_; + + // Reset other to a valid but empty state + other.shape_.clear(); + other.strides_.clear(); + other.offset_ = 0; + other.size_ = -1; + // Note: other.data_ remains a valid empty shared_ptr after std::move + } + return *this; + } + const Tensor operator+=(const Tensor &other) { *this = *this + other; From 1fb7f21670615120773e186e1a35da0bed18e5b6 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:44:50 +0800 Subject: [PATCH 09/17] refractor: change the default type from double to float --- examples/test_conv2d.cpp | 4 +- include/core/loss.hpp | 4 +- include/datasets/mnist.hpp | 10 ++--- include/metrics/accuracy.hpp | 2 +- include/models/cnn.hpp | 13 +++++- include/modules/activations/softmax.hpp | 2 +- include/modules/layers/dropout.hpp | 6 +-- include/modules/layers/max_pool.hpp | 3 +- include/modules/losses/cross_entropy.hpp | 2 +- include/modules/losses/mse.hpp | 2 +- include/utils/conv2d_utils.hpp | 23 +++++++++- src/datasets/mnist.cpp | 8 ++-- src/metrics/accuracy.cpp | 4 +- src/models/cnn.cpp | 9 ++++ src/modules/activations/softmax.cpp | 14 +++---- src/modules/layers/conv2d.cpp | 10 +++-- src/modules/layers/dropout.cpp | 2 +- src/modules/layers/flatten.cpp | 6 --- src/modules/layers/max_pool.cpp | 53 ++++++++++++++++++++++++ src/modules/losses/cross_entropy.cpp | 8 ++-- src/modules/losses/mse.cpp | 6 +-- src/utils/conv2d_utils.cpp | 2 +- tests/core/tensor_test.cpp | 34 +++++++-------- 23 files changed, 157 insertions(+), 70 deletions(-) diff --git a/examples/test_conv2d.cpp b/examples/test_conv2d.cpp index f9bbd43..793d870 100644 --- a/examples/test_conv2d.cpp +++ b/examples/test_conv2d.cpp @@ -32,7 +32,7 @@ int main() Tensor<> test_weight = Tensor<>({out_channels, in_channels, weight_size, weight_size}, 0.0f); Tensor<> test_bias = 
Tensor<>({out_channels}, 0.0f); - double val = 0.01; + float val = 0.01; for (size_t i = 0; i < out_channels; i++) { for (size_t j = 0; j < in_channels; j++) @@ -209,7 +209,7 @@ int main() output_3 /= 1e6; - double loss = cross_entropy(output_3, labels); + float loss = cross_entropy(output_3, labels); cout << "Loss: " << loss << endl; diff --git a/include/core/loss.hpp b/include/core/loss.hpp index 4f3eb14..0bba04a 100644 --- a/include/core/loss.hpp +++ b/include/core/loss.hpp @@ -6,9 +6,9 @@ class Loss { public: Loss() = default; virtual ~Loss() = default; - virtual double forward(const Tensor<>& Y_hat, const Tensor<>& Y) = 0; + virtual float forward(const Tensor<>& Y_hat, const Tensor<>& Y) = 0; virtual Tensor<> backward() = 0; - inline double operator()(const Tensor<>& Y_hat, const Tensor<>& Y) { return this->forward(Y_hat, Y); } + inline float operator()(const Tensor<>& Y_hat, const Tensor<>& Y) { return this->forward(Y_hat, Y); } protected: diff --git a/include/datasets/mnist.hpp b/include/datasets/mnist.hpp index 9d4f0d9..66c85b8 100644 --- a/include/datasets/mnist.hpp +++ b/include/datasets/mnist.hpp @@ -10,7 +10,7 @@ using namespace std; struct Batch { - vector> batch_data; + vector> batch_data; vector batch_labels; tuple, Tensor<>> to_tensor(); @@ -18,11 +18,11 @@ struct Batch { class MNIST { private: - const double MNIST_MEAN = 0.1307f; - const double MNIST_STD = 0.3081f; + const float MNIST_MEAN = 0.1307f; + const float MNIST_STD = 0.3081f; const int MNIST_NUM_LABELS = 10; - vector> images; + vector> images; vector labels; size_t current_batch_idxs = 0; @@ -33,7 +33,7 @@ class MNIST { template T reverse_int(T value); - double normalize(double value); + float normalize(float value); bool read_images(const string& path); bool read_labels(const string& path); diff --git a/include/metrics/accuracy.hpp b/include/metrics/accuracy.hpp index 329c970..e1898c6 100644 --- a/include/metrics/accuracy.hpp +++ b/include/metrics/accuracy.hpp @@ -2,5 +2,5 @@ #include "tensor.hpp" namespace metrics { - double accuracy(const Tensor<>& output, const Tensor<>& target); + float accuracy(const Tensor<>& output, const Tensor<>& target); } \ No newline at end of file diff --git a/include/models/cnn.hpp b/include/models/cnn.hpp index c9f955a..ead5170 100644 --- a/include/models/cnn.hpp +++ b/include/models/cnn.hpp @@ -5,14 +5,23 @@ using namespace nn; class CNN : public Module { public: - CNN(vector layer_sizes, double dropout_p = 0.5); - CNN(initializer_list layer_sizes, double dropout_p = 0.5); + CNN(vector layer_sizes, float dropout_p = 0.5); + CNN(initializer_list layer_sizes, float dropout_p = 0.5); ~CNN(); virtual Tensor<> forward(const Tensor<> &input) override; virtual Tensor<> backward(const Tensor<> &grad_output) override; virtual void update_params(const float lr) override; +protected: + /** + * Applies a function to all child modules. + * This implementation iterates through all layers and applies the function to each. + * + * @param fn A function that takes a Module reference and returns void. 
+ */ + virtual void apply_to_children(const function& fn) override; + private: vector layers_; int num_layers_; diff --git a/include/modules/activations/softmax.hpp b/include/modules/activations/softmax.hpp index 9447211..5eee0dd 100644 --- a/include/modules/activations/softmax.hpp +++ b/include/modules/activations/softmax.hpp @@ -8,7 +8,7 @@ class Softmax : public Module { // Helper function to deal with multiple dimensions Tensor<> softmax_helper(const Tensor<>& input); - vector softmax_helper(const vector& input); + vector softmax_helper(const vector& input); public: Softmax(); Tensor<> forward(const Tensor<>& input); diff --git a/include/modules/layers/dropout.hpp b/include/modules/layers/dropout.hpp index b465726..2363e78 100644 --- a/include/modules/layers/dropout.hpp +++ b/include/modules/layers/dropout.hpp @@ -5,12 +5,12 @@ namespace nn { class Dropout : public Module { public: - Dropout(double p = 0.5); + Dropout(float p = 0.5); virtual Tensor<> forward(const Tensor<>& input) override; virtual Tensor<> backward(const Tensor<>& grad_output) override; private: - double p_; - double scale_; + float p_; + float scale_; Tensor<> mask_cache_; // probability distribution of the dropout bernoulli_distribution pmf_; diff --git a/include/modules/layers/max_pool.hpp b/include/modules/layers/max_pool.hpp index 93ad1e1..45fd05b 100644 --- a/include/modules/layers/max_pool.hpp +++ b/include/modules/layers/max_pool.hpp @@ -7,11 +7,10 @@ namespace nn class MaxPool2d : public Module { public: - MaxPool2d(var_pair kernel_size, var_pair stride, var_pair padding, const string &padding_mode = "zeros"); + MaxPool2d(var_pair kernel_size, var_pair stride = (size_t)1, var_pair padding = (size_t)0, const string &padding_mode = "zeros"); virtual Tensor<> forward(const Tensor<> &input) override; virtual Tensor<> backward(const Tensor<> &grad_output) override; - virtual void update_params(const float lr) override; private: size_tp2 kernel_size_; diff --git a/include/modules/losses/cross_entropy.hpp b/include/modules/losses/cross_entropy.hpp index d8871b4..7329e94 100644 --- a/include/modules/losses/cross_entropy.hpp +++ b/include/modules/losses/cross_entropy.hpp @@ -7,7 +7,7 @@ namespace nn { class CrossEntropyLoss : public Loss { public: CrossEntropyLoss(); - virtual double forward(const Tensor<>& Y_hat, const Tensor<>& Y) override; + virtual float forward(const Tensor<>& Y_hat, const Tensor<>& Y) override; virtual Tensor<> backward() override; private: diff --git a/include/modules/losses/mse.hpp b/include/modules/losses/mse.hpp index 41ccd3d..52ece1c 100644 --- a/include/modules/losses/mse.hpp +++ b/include/modules/losses/mse.hpp @@ -6,7 +6,7 @@ namespace nn { class MSE : public Loss{ public: MSE(); - virtual double forward(const Tensor<>& Y_hat, const Tensor<>& Y) override; + virtual float forward(const Tensor<>& Y_hat, const Tensor<>& Y) override; virtual Tensor<> backward() override; }; diff --git a/include/utils/conv2d_utils.hpp b/include/utils/conv2d_utils.hpp index a53b9bd..73498a0 100644 --- a/include/utils/conv2d_utils.hpp +++ b/include/utils/conv2d_utils.hpp @@ -31,4 +31,25 @@ const vector calculate_output_shape(const vector &input_shape, c Tensor<> flip_vertical_and_horizontal(const Tensor<> &input); -Tensor<> dilate_input(const Tensor<> &input, const size_tp2 &dilation); \ No newline at end of file +Tensor<> dilate_input(const Tensor<> &input, const size_tp2 &dilation); + +// // Helper lambda to process variant parameters +// template +// size_tp2 process_variant(U &&arg) +// { +// using T = 
std::decay_t; + +// if constexpr (std::is_same_v) +// { +// if (arg < 0) +// { +// throw std::invalid_argument("Negative kernel size, stride, padding, or dilation is not supported"); +// } +// return {arg, arg}; +// } +// else +// { +// static_assert(std::is_same_v, "Unexpected type in variant"); +// return arg; +// } +// } \ No newline at end of file diff --git a/src/datasets/mnist.cpp b/src/datasets/mnist.cpp index bb8b178..fd28cc4 100644 --- a/src/datasets/mnist.cpp +++ b/src/datasets/mnist.cpp @@ -49,8 +49,8 @@ T MNIST::reverse_int(T value) { return result; } -double MNIST::normalize(double value) { - double scaled = value / 255.0f; +float MNIST::normalize(float value) { + float scaled = value / 255.0f; return (scaled - this->MNIST_MEAN) / this->MNIST_STD; } @@ -73,13 +73,13 @@ bool MNIST::read_images(const string& path) { numRows = reverse_int(numRows); numCols = reverse_int(numCols); - this->images.resize(numImages, vector(numRows * numCols)); + this->images.resize(numImages, vector(numRows * numCols)); for(int i = 0; i < numImages; i++) { for(int j = 0; j < numRows * numCols; j++) { unsigned char temp = 0; file.read(reinterpret_cast(&temp), sizeof(temp)); - this->images[i][j] = normalize(static_cast(temp)); + this->images[i][j] = normalize(static_cast(temp)); } } diff --git a/src/metrics/accuracy.cpp b/src/metrics/accuracy.cpp index 9a7c98c..96080f8 100644 --- a/src/metrics/accuracy.cpp +++ b/src/metrics/accuracy.cpp @@ -1,6 +1,6 @@ #include "accuracy.hpp" -double metrics::accuracy(const Tensor<>& output, const Tensor<>& target) { +float metrics::accuracy(const Tensor<>& output, const Tensor<>& target) { Tensor output_argmax = output.argmax(); Tensor target_argmax; @@ -18,5 +18,5 @@ double metrics::accuracy(const Tensor<>& output, const Tensor<>& target) { Tensor result = output_argmax.equal(target_argmax); - return (double)result.sum() / (double)result.shapes()[0]; + return (float)result.sum() / (float)result.shapes()[0]; } \ No newline at end of file diff --git a/src/models/cnn.cpp b/src/models/cnn.cpp index e69de29..0433f0f 100644 --- a/src/models/cnn.cpp +++ b/src/models/cnn.cpp @@ -0,0 +1,9 @@ +#include "cnn.hpp" + +void CNN::apply_to_children(const function& fn) +{ + for (Module* layer : this->layers_) + { + fn(*layer); + } +} diff --git a/src/modules/activations/softmax.cpp b/src/modules/activations/softmax.cpp index d9bf626..752577c 100644 --- a/src/modules/activations/softmax.cpp +++ b/src/modules/activations/softmax.cpp @@ -10,17 +10,17 @@ Softmax::Softmax() Tensor<> Softmax::softmax_helper(const Tensor<> &input) { - Tensor<> result = input.map([](double x) + Tensor<> result = input.map([](float x) { return exp(x); }); - double sum = result.sum(); + float sum = result.sum(); return result / sum; } -vector Softmax::softmax_helper(const vector &input) +vector Softmax::softmax_helper(const vector &input) { - double sum = 0.0f; - vector result; + float sum = 0.0f; + vector result; for (size_t i = 0; i < input.size(); i++) { @@ -52,11 +52,11 @@ Tensor<> Softmax::forward(const Tensor<> &input) // const size_t n = input.shapes()[leading_ndim]; // const size_t m = input.shapes()[leading_ndim + 1]; - vector> softmax_input; + vector> softmax_input; for (size_t i = 0; i < input.shapes()[0]; i++) { - vector input_row; + vector input_row; input_row.reserve(input.shapes()[1]); for (size_t j = 0; j < input.shapes()[1]; j++) diff --git a/src/modules/layers/conv2d.cpp b/src/modules/layers/conv2d.cpp index 5e7a87d..99a11d8 100644 --- a/src/modules/layers/conv2d.cpp +++ 
b/src/modules/layers/conv2d.cpp @@ -93,6 +93,9 @@ Tensor<> Conv2d::forward(const Tensor<> &input) Tensor<> Conv2d::backward(const Tensor<> &grad_output) { + /* + + */ // dL_dY = grad_output // dL_dW = conv(input_data, dL_dY) @@ -140,8 +143,7 @@ Tensor<> Conv2d::backward(const Tensor<> &grad_output) flipped_weight.print(); cout << endl; - - Tensor<> permuted_flipped_weight = flipped_weight.permute(1, 0, 2, 3); + Tensor<> permuted_flipped_weight = flipped_weight.permute(1, 0, 2, 3); cout << "permuted_flipped_weight: " << endl; permuted_flipped_weight.print(); @@ -204,12 +206,12 @@ void Conv2d::reset_parameters() size_t n = this->in_channels_; n *= this->kernel_size_.first * this->kernel_size_.second; - const double stdv = 1.0 / sqrt(n); + const float stdv = 1.0 / sqrt(n); // Set up the random number generator random_device rd; mt19937 gen(rd()); - uniform_real_distribution dis(-stdv, stdv); + uniform_real_distribution dis(-stdv, stdv); for (size_t i = 0; i < this->out_channels_; i++) { diff --git a/src/modules/layers/dropout.cpp b/src/modules/layers/dropout.cpp index 2fadaa8..6809db1 100644 --- a/src/modules/layers/dropout.cpp +++ b/src/modules/layers/dropout.cpp @@ -1,7 +1,7 @@ #include "dropout.hpp" using namespace nn; -Dropout::Dropout(double p) { +Dropout::Dropout(float p) { if (p < 0 || p > 1) { throw runtime_error("Dropout probability must be between 0 and 1"); } diff --git a/src/modules/layers/flatten.cpp b/src/modules/layers/flatten.cpp index 2c06798..96f245f 100644 --- a/src/modules/layers/flatten.cpp +++ b/src/modules/layers/flatten.cpp @@ -17,9 +17,3 @@ Tensor<> Flatten::backward(const Tensor<> &grad_output) { return grad_output.reshape(this->original_input_shape_); } - -void Flatten::update_params(const float lr) -{ - // we don't need to update any parameters - return; -} diff --git a/src/modules/layers/max_pool.cpp b/src/modules/layers/max_pool.cpp index e69de29..72a6fab 100644 --- a/src/modules/layers/max_pool.cpp +++ b/src/modules/layers/max_pool.cpp @@ -0,0 +1,53 @@ +#include "max_pool.hpp" +using namespace nn; + +MaxPool2d::MaxPool2d(var_pair kernel_size, var_pair stride, var_pair padding, const string &padding_mode) +{ + // Helper lambda to process variant parameters + auto process_variant = [](auto &&arg) -> size_tp2 + { + using T = std::decay_t; + if constexpr (std::is_same_v) + { + if (arg < 0) + { + throw std::invalid_argument("Negative kernel size, stride, padding, or dilation is not supported"); + } + return {arg, arg}; + } + else + { + static_assert(std::is_same_v, "Unexpected type in variant"); + return arg; + } + }; + + // Set kernel size, stride, and padding + this->kernel_size_ = std::visit(process_variant, kernel_size); + this->stride_ = std::visit(process_variant, stride); + this->padding_ = std::visit(process_variant, padding); + + // Check if padding mode is valid + unordered_map all_padding_modes = {{"zeros", PaddingMode::ZEROS}, {"reflect", PaddingMode::REFLECT}, {"replicate", PaddingMode::REPLICATE}}; + + if (all_padding_modes.find(padding_mode) == all_padding_modes.end()) + { + throw std::invalid_argument("Padding mode must be one of 'zeros', 'reflect', or 'replicate'"); + } + + // Set padding mode + this->padding_mode_ = all_padding_modes[padding_mode]; + this->padding_module_ = Padding(this->padding_, this->padding_mode_); +} + +Tensor<> MaxPool2d::forward(const Tensor<> &input) +{ + this->original_input_shape_ = input.shapes(); + + // return output; +} + +Tensor<> MaxPool2d::backward(const Tensor<> &grad_output) +{ + // return grad_input; +} \ No 
newline at end of file diff --git a/src/modules/losses/cross_entropy.cpp b/src/modules/losses/cross_entropy.cpp index c028c89..31e3ce4 100644 --- a/src/modules/losses/cross_entropy.cpp +++ b/src/modules/losses/cross_entropy.cpp @@ -9,7 +9,7 @@ CrossEntropyLoss::CrossEntropyLoss() cout << "CrossEntropyLoss initialized" << endl; } -double CrossEntropyLoss::forward(const Tensor<> &Y_hat, const Tensor<> &Y) +float CrossEntropyLoss::forward(const Tensor<> &Y_hat, const Tensor<> &Y) { /* L = 1 / B \sum_{i=1}^B \sum_{j=1}^M Y_{ij} * log(softmax(Y_hat_{ij})) @@ -24,7 +24,7 @@ double CrossEntropyLoss::forward(const Tensor<> &Y_hat, const Tensor<> &Y) if (Y.ndim() == 2) { // In this case, we assume Y is a matrix of one-hot vectors. So we can just store the index of the correct label - this->Y_cache_ = Y.argmax().dtype(); + this->Y_cache_ = Y.argmax().dtype(); } else if (Y.ndim() == 1) { @@ -37,14 +37,14 @@ double CrossEntropyLoss::forward(const Tensor<> &Y_hat, const Tensor<> &Y) // B = batch size const size_t B = this->Y_cache_.shapes()[0]; - const double factor = -1.0f / B; + const float factor = -1.0f / B; // apply softmax to model output Tensor<> softmax_Y_hat = this->softmax_(Y_hat); this->softmax_Y_hat_cache_ = softmax_Y_hat; // sum up all the elements - double loss_without_factor = 0.0f; + float loss_without_factor = 0.0f; for (int i = 0; i < B; ++i) { diff --git a/src/modules/losses/mse.cpp b/src/modules/losses/mse.cpp index 1a9a497..7dfaef8 100644 --- a/src/modules/losses/mse.cpp +++ b/src/modules/losses/mse.cpp @@ -3,7 +3,7 @@ using namespace nn; MSE::MSE() {} -double MSE::forward(const Tensor<>& Y_hat, const Tensor<>& Y) { +float MSE::forward(const Tensor<>& Y_hat, const Tensor<>& Y) { // Y R^B x M, Y_hat R^B x M , B is the batch size and M is the output dimension // 1 / (B * M) * ||(Y - Y_hat)||^2 @@ -19,7 +19,7 @@ double MSE::forward(const Tensor<>& Y_hat, const Tensor<>& Y) { Tensor<> diff = Y - Y_hat; diff *= diff; - double loss_without_factor = diff.sum(); + float loss_without_factor = diff.sum(); return loss_without_factor / (B * M); } @@ -30,7 +30,7 @@ Tensor<> MSE::backward() { // 2 / (B * M) * (Y - Y_hat) const size_t B = this->Y_cache_.shapes()[0], M = this->Y_cache_.shapes()[1]; - const double factor = 2.0f / (B * M); + const float factor = 2.0f / (B * M); Tensor<> diff = this->Y_cache_ - this->Y_hat_cache_; diff --git a/src/utils/conv2d_utils.cpp b/src/utils/conv2d_utils.cpp index 457e971..492157c 100644 --- a/src/utils/conv2d_utils.cpp +++ b/src/utils/conv2d_utils.cpp @@ -196,7 +196,7 @@ Tensor<> flip_vertical_and_horizontal(const Tensor<> &input) const size_t H = input.shapes()[2]; const size_t W = input.shapes()[3]; - double cache; + float cache; for (size_t b = 0; b < B; ++b) { diff --git a/tests/core/tensor_test.cpp b/tests/core/tensor_test.cpp index 9fb0c31..5afdc62 100644 --- a/tests/core/tensor_test.cpp +++ b/tests/core/tensor_test.cpp @@ -94,7 +94,7 @@ TEST_CASE("TensorTest - 3D Tensor Constructor from initializer_list") TEST_CASE("TensorTest - 1D Tensor Constructor from vector") { - vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + vector data = {1.0f, 2.0f, 3.0f, 4.0f}; Tensor<> tensor1 = data; CHECK(tensor1.ndim() == 1); CHECK(tensor1.size() == 4); @@ -104,7 +104,7 @@ TEST_CASE("TensorTest - 1D Tensor Constructor from vector") CHECK(tensor1[2] == 3.0f); CHECK(tensor1[3] == 4.0f); - vector data2 = {0}; + vector data2 = {0}; Tensor<> tensor2 = data2; CHECK(tensor2.shapes()[0] == 1); CHECK(tensor2.ndim() == 1); @@ -114,7 +114,7 @@ TEST_CASE("TensorTest - 1D Tensor Constructor 
from vector") TEST_CASE("TensorTest - 2D Tensor Constructor from vector") { - vector> data = {{1.0f, 2.0f}, {3.0f, 4.0f}}; + vector> data = {{1.0f, 2.0f}, {3.0f, 4.0f}}; Tensor<> tensor = data; CHECK(tensor.ndim() == 2); CHECK(tensor.size() == 4); @@ -125,7 +125,7 @@ TEST_CASE("TensorTest - 2D Tensor Constructor from vector") CHECK(tensor[1, 0] == 3.0f); CHECK(tensor[1, 1] == 4.0f); - vector> data2 = {{0.0f, 0.0f}}; + vector> data2 = {{0.0f, 0.0f}}; Tensor<> tensor2 = data2; CHECK(tensor2.ndim() == 2); CHECK(tensor2.size() == 2); @@ -137,7 +137,7 @@ TEST_CASE("TensorTest - 2D Tensor Constructor from vector") TEST_CASE("TensorTest - 3D Tensor Constructor from vector") { - vector>> data = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; + vector>> data = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; Tensor<> tensor = data; CHECK(tensor.ndim() == 3); CHECK(tensor.size() == 8); @@ -150,7 +150,7 @@ TEST_CASE("TensorTest - 3D Tensor Constructor from vector") CHECK(tensor[0, 1, 1] == 4.0f); CHECK(tensor[1, 1, 1] == 8.0f); - vector>> data2 = {{{0.0f, 0.0f}, {0.0f, 0.0f}}, {{0.0f, 0.0f}, {0.0f, 0.0f}}}; + vector>> data2 = {{{0.0f, 0.0f}, {0.0f, 0.0f}}, {{0.0f, 0.0f}, {0.0f, 0.0f}}}; Tensor<> tensor2 = data2; CHECK(tensor2.ndim() == 3); CHECK(tensor2.size() == 8); @@ -166,7 +166,7 @@ TEST_CASE("TensorTest - 3D Tensor Constructor from vector") TEST_CASE("TensorTest - 4D Tensor Constructor from vector") { - vector>>> data = {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, {{{9.0f, 10.0f}, {11.0f, 12.0f}}, {{13.0f, 14.0f}, {15.0f, 16.0f}}}}; + vector>>> data = {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, {{{9.0f, 10.0f}, {11.0f, 12.0f}}, {{13.0f, 14.0f}, {15.0f, 16.0f}}}}; Tensor<> tensor = data; CHECK(tensor.ndim() == 4); CHECK(tensor.size() == 16); @@ -493,22 +493,22 @@ TEST_CASE("TensorTest - abs") TEST_CASE("TensorTest - sum") { Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; - double sum_1d = tensor_1d.sum(); + float sum_1d = tensor_1d.sum(); CHECK(sum_1d == 10.0f); Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; - double sum_2d = tensor_2d.sum(); + float sum_2d = tensor_2d.sum(); CHECK(sum_2d == 10.0f); Tensor<> tensor_3d = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; - double sum_3d = tensor_3d.sum(); + float sum_3d = tensor_3d.sum(); CHECK(sum_3d == 36.0f); } TEST_CASE("TensorTest - filter") { Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; - Tensor<> filtered_tensor_1d = tensor_1d.filter([](double x) + Tensor<> filtered_tensor_1d = tensor_1d.filter([](float x) { return x < 3.0f; }); CHECK(filtered_tensor_1d.ndim() == 1); CHECK(filtered_tensor_1d.size() == 4); @@ -519,7 +519,7 @@ TEST_CASE("TensorTest - filter") CHECK(filtered_tensor_1d[3] == 0.0f); Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; - Tensor<> filtered_tensor_2d = tensor_2d.filter([](double x) + Tensor<> filtered_tensor_2d = tensor_2d.filter([](float x) { return x < 3.0f; }); CHECK(filtered_tensor_2d.ndim() == 2); CHECK(filtered_tensor_2d.size() == 4); @@ -531,7 +531,7 @@ TEST_CASE("TensorTest - filter") CHECK(filtered_tensor_2d[1, 1] == 0.0f); Tensor<> tensor_3d = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; - Tensor<> filtered_tensor_3d = tensor_3d.filter([](double x) + Tensor<> filtered_tensor_3d = tensor_3d.filter([](float x) { return x < 3.0f; }); CHECK(filtered_tensor_3d.ndim() == 3); CHECK(filtered_tensor_3d.size() == 8); @@ -550,10 +550,10 @@ TEST_CASE("TensorTest - filter") TEST_CASE("TensorTest - map") { - double eps = 
1e-5f; + float eps = 1e-5f; Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; - Tensor<> tensor_1d_exp = tensor_1d.map([](double x) + Tensor<> tensor_1d_exp = tensor_1d.map([](float x) { return exp(x); }); CHECK(tensor_1d_exp.ndim() == 1); CHECK(tensor_1d_exp.size() == 4); @@ -564,7 +564,7 @@ TEST_CASE("TensorTest - map") CHECK(tensor_1d_exp[3] - exp(4.0f) < eps); Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; - Tensor<> tensor_2d_times_10 = tensor_2d.map([](double x) + Tensor<> tensor_2d_times_10 = tensor_2d.map([](float x) { return x * 10.0f; }); CHECK(tensor_2d_times_10.ndim() == 2); CHECK(tensor_2d_times_10.size() == 4); @@ -576,7 +576,7 @@ TEST_CASE("TensorTest - map") CHECK(tensor_2d_times_10[1, 1] == 40.0f); Tensor<> tensor_3d = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; - Tensor<> tensor_3d_log = tensor_3d.map([](double x) + Tensor<> tensor_3d_log = tensor_3d.map([](float x) { return log(x); }); CHECK(tensor_3d_log.ndim() == 3); CHECK(tensor_3d_log.size() == 8); From a9b74fb857e0b0b3763a6e6d65801bd6895a5540 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:46:35 +0800 Subject: [PATCH 10/17] feat: fix the problem of train() and eval() that cannot propagate the state to the module component when a class has a vector of Module* --- include/core/module.hpp | 57 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/include/core/module.hpp b/include/core/module.hpp index ea7b266..2e6c4ea 100644 --- a/include/core/module.hpp +++ b/include/core/module.hpp @@ -32,10 +32,21 @@ class Module { /** * Virtual function to update the parameters of the module. + * Some modules do not have parameters to update (e.g., ReLU, Dropout), so this function can be empty. * @param lr The learning rate for the update. */ virtual void update_params(const float lr) { return; }; + /** + * Virtual function to create a deep copy of the module. + * This should be overridden by derived classes to create a proper deep copy. + * + * @return A pointer to a new Module instance that is a deep copy of this module. + */ + virtual Module* clone() const { + throw std::runtime_error("clone() method not implemented for this module type"); + } + /** * Operator overload to enable calling the module like a function. * @param input The input data as a 2D Tensor. @@ -45,8 +56,39 @@ class Module { return this->forward(input); } - inline void train(const bool mode = true) { this->training = mode; } - inline void eval() { this->training = false; } + /** + * Sets the module in training mode (train=true) or evaluation mode (train=false). + * This affects certain modules like Dropout and BatchNorm whose behavior differs + * between training and evaluation. + * + * @param mode If true, sets the module to training mode, otherwise to evaluation mode. + * @return A reference to this module for method chaining. + */ + virtual Module& train(const bool mode = true) { + this->training = mode; + apply_to_children([mode](Module& child) { child.train(mode); }); + return *this; + } + + /** + * Sets the module in evaluation mode. This affects certain modules like Dropout + * and BatchNorm whose behavior differs between training and evaluation. + * + * This method also propagates to any child modules that might be members of this module. + * + * @return A reference to this module for method chaining. 
+ */ + virtual Module& eval() { + this->training = false; + apply_to_children([](Module& child) { child.eval(); }); + return *this; + } + + /** + * Check if the module is in training mode. + * + * @return true if the module is in training mode, false otherwise. + */ inline bool is_training() const { return this->training; } protected: @@ -55,6 +97,17 @@ class Module { */ Tensor<> input_cache_; bool training = true; + + /** + * Virtual method to apply a function to all child modules. + * Modules that contain other modules should override this method. + * + * @param fn A function that takes a Module reference and returns void. + */ + virtual void apply_to_children(const function& fn) { + // Default implementation does nothing + // Modules with children should override this + } }; } \ No newline at end of file From 8def719a571884a8ba6f4fee3e125e49035532f0 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:46:51 +0800 Subject: [PATCH 11/17] feat: add sequential container for better module management --- include/modules/containers/sequential.hpp | 126 ++++++++++++++++++++++ src/modules/containers/sequential.cpp | 103 ++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 include/modules/containers/sequential.hpp create mode 100644 src/modules/containers/sequential.cpp diff --git a/include/modules/containers/sequential.hpp b/include/modules/containers/sequential.hpp new file mode 100644 index 0000000..4041fe6 --- /dev/null +++ b/include/modules/containers/sequential.hpp @@ -0,0 +1,126 @@ +#pragma once +#include "module.hpp" +#include +#include + +namespace nn { + +/** + * A sequential container for modules. + * Modules are added in the order they should be called during forward pass. + * Propagates training state to all contained modules. + */ +class Sequential : public Module { +public: + /** + * Default constructor for an empty sequential container. + */ + Sequential(); + + /** + * Constructor that takes a list of modules. + * + * @param modules A list of Module pointers to add to the container. + */ + Sequential(std::initializer_list modules); + + /** + * Constructor that takes a vector of modules. + * + * @param modules A vector of Module pointers to add to the container. + */ + Sequential(const std::vector& modules); + + /** + * Copy constructor is deleted to prevent unintended copying + * of modules which would lead to memory issues with ownership. + */ + Sequential(const Sequential&) = delete; + + /** + * Move constructor that transfers ownership of modules from + * one Sequential container to another. + */ + Sequential(Sequential&& other) noexcept; + + /** + * Destructor that deletes all owned modules. + */ + virtual ~Sequential(); + + /** + * Passes the input tensor through all modules in sequence. + * + * @param input The input tensor to pass through the modules. + * @return The output from the last module. + */ + virtual Tensor<> forward(const Tensor<>& input) override; + + /** + * Propagates the gradient backward through all modules in reverse order. + * + * @param grad_output The gradient of the loss with respect to the output. + * @return The gradient with respect to the input. + */ + virtual Tensor<> backward(const Tensor<>& grad_output) override; + + /** + * Updates the parameters of all modules. + * + * @param lr The learning rate for the update. + */ + virtual void update_params(const float lr) override; + + /** + * Adds a module to the end of the sequence. + * + * @param module The module to add. 
+ * @return A reference to this Sequential container for chaining. + */ + Sequential& add(Module* module); + + /** + * Gets the number of modules in the sequence. + * + * @return The number of modules. + */ + inline size_t size() const { + return this->modules_.size(); + } + + /** + * Gets a module at a specific index. + * + * @param index The index of the module to get. + * @return The module at the specified index. + */ + Module* get(size_t index) const; + + /** + * Copy assignment operator is deleted to prevent unintended copying + * of modules which would lead to memory issues with ownership. + * + * Consider using move semantics or implementing a clone method if + * copying is needed. + */ + Sequential& operator=(const Sequential&) = delete; + + /** + * Move assignment operator to transfer ownership of modules from one + * Sequential container to another. + */ + Sequential& operator=(Sequential&& other) noexcept; + +protected: + /** + * Applies a function to all child modules. + * + * @param fn A function that takes a Module reference and returns void. + */ + virtual void apply_to_children(const function& fn) override; + +private: + std::vector modules_; +}; + +} // namespace nn \ No newline at end of file diff --git a/src/modules/containers/sequential.cpp b/src/modules/containers/sequential.cpp new file mode 100644 index 0000000..02b46c4 --- /dev/null +++ b/src/modules/containers/sequential.cpp @@ -0,0 +1,103 @@ +#include "sequential.hpp" +#include + +namespace nn { + +// Default constructor +Sequential::Sequential() = default; + +// Constructor with initializer_list +Sequential::Sequential(std::initializer_list modules) { + for (Module* module : modules) { + this->modules_.push_back(module); + } +} + +// Constructor with vector +Sequential::Sequential(const std::vector& modules) { + for (Module* module : modules) { + this->modules_.push_back(module); + } +} + +// Move constructor +Sequential::Sequential(Sequential&& other) noexcept : modules_(std::move(other.modules_)) { + // Clear the other container's vector after moving + other.modules_.clear(); +} + +// Destructor +Sequential::~Sequential() { + for (Module* module : this->modules_) { + delete module; + } +} + +// Forward pass +Tensor<> Sequential::forward(const Tensor<>& input) { + Tensor<> x = input; + + for (Module* module : this->modules_) { + x = module->forward(x); + } + + return x; +} + +// Backward pass +Tensor<> Sequential::backward(const Tensor<>& grad_output) { + Tensor<> grad = grad_output; + + for (int i = this->modules_.size() - 1; i >= 0; i--) { + grad = this->modules_[i]->backward(grad); + } + + return grad; +} + +// Update parameters +void Sequential::update_params(const float lr) { + for (Module* module : this->modules_) { + module->update_params(lr); + } +} + +// Add a module +Sequential& Sequential::add(Module* module) { + this->modules_.push_back(module); + return *this; +} + +// Get module at index +Module* Sequential::get(size_t index) const { + if (index >= this->modules_.size()) { + throw std::out_of_range("Index out of range"); + } + return this->modules_[index]; +} + +// Move assignment operator +Sequential& Sequential::operator=(Sequential&& other) noexcept { + if (this != &other) { + // First clean up existing modules + for (Module* module : this->modules_) { + delete module; + } + + // Transfer ownership + this->modules_ = std::move(other.modules_); + + // Clear the other container's vector + other.modules_.clear(); + } + return *this; +} + +// Apply function to children +void 
Sequential::apply_to_children(const function& fn) { + for (Module* module : this->modules_) { + fn(*module); + } +} + +} // namespace nn \ No newline at end of file From 9659ecb5c1159fd0f0f1e7b7b7ddc2ec53015d02 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:47:09 +0800 Subject: [PATCH 12/17] feat: change the implementation of MLP of using Sequential --- include/models/mlp.hpp | 12 ++++---- src/models/mlp.cpp | 65 ++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 43 deletions(-) diff --git a/include/models/mlp.hpp b/include/models/mlp.hpp index b3bbf8b..877128b 100644 --- a/include/models/mlp.hpp +++ b/include/models/mlp.hpp @@ -1,19 +1,21 @@ #pragma once #include "module.hpp" +#include "sequential.hpp" using namespace nn; class MLP : public Module { public: - MLP(vector layer_sizes, double dropout_p = 0.5); - MLP(initializer_list layer_sizes, double dropout_p = 0.5); - ~MLP(); + MLP(size_t in_channels, initializer_list hidden_channels, bool bias = true, float dropout = 0.0); virtual Tensor<> forward(const Tensor<>& input) override; virtual Tensor<> backward(const Tensor<>& grad_output) override; virtual void update_params(const float lr) override; + virtual Module& train(const bool mode = true) override; + virtual Module& eval() override; + + inline const Sequential& get_layers() const { return this->layers_; } private: - vector layers_; - int num_layers_; + Sequential layers_; // we use a sequential container to store the layers }; \ No newline at end of file diff --git a/src/models/mlp.cpp b/src/models/mlp.cpp index f1d0700..aad6493 100644 --- a/src/models/mlp.cpp +++ b/src/models/mlp.cpp @@ -3,61 +3,50 @@ #include "relu.hpp" #include "dropout.hpp" -MLP::MLP(vector layer_sizes, double dropout_p) +MLP::MLP(size_t in_channels, initializer_list hidden_channels, bool bias, float dropout) { - this->num_layers_ = layer_sizes.size(); - - for (size_t i = 0; i < this->num_layers_ - 1; i++) + vector layers; + size_t i = 0; + for (size_t hidden_channel : hidden_channels) { - this->layers_.push_back(new Linear(layer_sizes[i], layer_sizes[i + 1], true)); - if (i < this->num_layers_ - 2) + layers.push_back(new Linear(in_channels, hidden_channel, bias)); + in_channels = hidden_channel; // update the input channels for the next layer + + if (i < hidden_channels.size() - 1) { - this->layers_.push_back(new ReLU()); - this->layers_.push_back(new Dropout(dropout_p)); + layers.push_back(new ReLU()); + if (dropout > 0.0f) { + layers.push_back(new Dropout(dropout)); + } } + i++; } -} -MLP::MLP(initializer_list layer_sizes, double dropout_p) : MLP(vector(layer_sizes), dropout_p) {} - -MLP::~MLP() -{ - for (Module *layer : this->layers_) - { - delete layer; - } + // Move the vector directly to the Sequential constructor + this->layers_ = Sequential(std::move(layers)); } Tensor<> MLP::forward(const Tensor<> &input) { - Tensor<> x = input; - - for (Module *layer : this->layers_) - { - x = layer->forward(x); - } - - return x; + return this->layers_.forward(input); } Tensor<> MLP::backward(const Tensor<> &grad_output) { - Tensor<> grad = grad_output; - - for (int i = this->layers_.size() - 1; i >= 0; i--) - { - grad = this->layers_[i]->backward(grad); - } - - return grad; + return this->layers_.backward(grad_output); } void MLP::update_params(const float lr) { - for (Module *layer : this->layers_) - { - layer->update_params(lr); - } + this->layers_.update_params(lr); +} + +Module& MLP::train(const bool mode) { + this->layers_.train(mode); + return *this; +} - return; 
+Module& MLP::eval() { + this->layers_.eval(); + return *this; } \ No newline at end of file From 9d450131c8fda2d7364d5ee5d63499d7b7dc9204 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:47:26 +0800 Subject: [PATCH 13/17] feat: change the implementation of MLP of using Sequential --- src/modules/layers/linear.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/modules/layers/linear.cpp b/src/modules/layers/linear.cpp index fa1ee40..cfa848b 100644 --- a/src/modules/layers/linear.cpp +++ b/src/modules/layers/linear.cpp @@ -51,10 +51,6 @@ Tensor<> Linear::backward(const Tensor<> &grad_output) // dL/dW = X^T * dL/dY this->grad_weight_ = this->input_cache_.transpose().matmul(grad_output); - // cout << endl << "dL/dW: " << endl; - // this->grad_weight_.print(); - // cout << endl; - // dL/dX = dL/dY * W^T Tensor<> grad_input = grad_output.matmul(this->weight_.transpose()); @@ -65,10 +61,6 @@ Tensor<> Linear::backward(const Tensor<> &grad_output) if (this->use_bias_) this->grad_bias_ = grad_output.transpose().matmul(Tensor<>({grad_output.shapes()[0], 1}, 1.0f)); - // cout << endl << "dL/db: " << endl; - // this->grad_bias_.print(); - // cout << endl; - return grad_input; } @@ -93,12 +85,12 @@ void Linear::reset_parameters() */ // Calculate the limit for the uniform distribution - const double stdv = 1.0 / sqrt(this->weight_.shapes()[0]); // since the weight is transposed + const float stdv = 1.0 / sqrt(this->weight_.shapes()[0]); // since the weight is transposed // Set up the random number generator random_device rd; mt19937 gen(rd()); - uniform_real_distribution dis(-stdv, stdv); + uniform_real_distribution dis(-stdv, stdv); // Xavier initialization for (size_t i = 0; i < this->in_features_; i++) From 9055e179a2d0144168497f3551406f0a3df02b84 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 22:47:31 +0800 Subject: [PATCH 14/17] refractor --- src/core/tensor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index c17b477..af50810 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -1,3 +1,5 @@ +#include "tensor.hpp" + /* Please refer to include/core/tensor.hpp */ \ No newline at end of file From b3dbbb88a363df790cf6b3389644d698c9147a87 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 23:20:22 +0800 Subject: [PATCH 15/17] refractor: bias is now a default argument --- include/modules/layers/linear.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/modules/layers/linear.hpp b/include/modules/layers/linear.hpp index 036ed50..bf41216 100644 --- a/include/modules/layers/linear.hpp +++ b/include/modules/layers/linear.hpp @@ -7,7 +7,7 @@ namespace nn class Linear : public Module { public: - Linear(size_t in_features, size_t out_features, bool bias); + Linear(size_t in_features, size_t out_features, bool bias = true); virtual Tensor<> forward(const Tensor<> &input) override; virtual Tensor<> backward(const Tensor<> &grad_output) override; From a403079266ba8050def419bd0b38c7fd618ab137 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 23:20:47 +0800 Subject: [PATCH 16/17] update readme by adding sequential instruction --- README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 580ab6e..cfde23a 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ brew install cmake brew install gcc ``` +If there is any problem, please try 
uninstalling `cmake` and `gcc`, and reinstalling them afterward. + For **Linux**, run the following commands: ```bash @@ -65,6 +67,41 @@ I implemented a tensor from scratch as well and integrate it to my neural networ For more details about tensor, please refer to [tensor tutorial](docs/tensor.md). +## Sequential Container + +To simply create your own neural network by stacking layers, feel free to use [`Sequential`](include/modules/containers/sequential.hpp). It is similar to keras `Sequential` (Although this repo should be a pytorch-like implementation :) ) + +### Example Usage + +```cpp +#include "sequential.hpp" +#include "module.hpp" +#include "linear.hpp" +#include "relu.hpp" +#include "dropout.hpp" +#include +using namespace std; + +vector layers = { new Linear(768, 256), + new ReLU(), + new Dropout(0.2), + new Linear(256, 128), + new ReLU(), + new Dropout(0.2), + new Linear(128, 10), + } + +Sequential container = layers; + +/* +To perform forward pass, simply do 'output = container(input)' + +Similarily, do 'container.backward(grad_output)' to perform backward pass. + +For more details, please check main.cpp in examples +*/ +``` + ## Module API The module API is defined in [`include/core/module.hpp`](include/core/module.hpp). @@ -74,7 +111,7 @@ To build your custom module, follow the instructions in `include/core/module.hpp ### Example usage ```cpp -#include +#include "module.hpp" using namespace nn; // Your code here From 8e029de1b1260d54a5e175e868262431d04c4a61 Mon Sep 17 00:00:00 2001 From: lucaswychan Date: Thu, 15 May 2025 23:21:01 +0800 Subject: [PATCH 17/17] refractor --- examples/test_linear.cpp | 46 ++++---- examples/test_softmax.cpp | 2 +- examples/test_tensor.cpp | 224 ++++++++++++++++++++++---------------- 3 files changed, 150 insertions(+), 122 deletions(-) diff --git a/examples/test_linear.cpp b/examples/test_linear.cpp index fdd8c8e..0d7053b 100644 --- a/examples/test_linear.cpp +++ b/examples/test_linear.cpp @@ -5,34 +5,32 @@ #include "dropout.hpp" using namespace nn; -int main() { - const bool bias = true; +int main() +{ + const bool bias = true; Linear linear_1(3, 5, bias); Linear linear_2(5, 7, bias); - Dropout dropout(0.3); + // Dropout dropout(0.3); Tensor<> specific_weights_1 = { {0.1, 0.4, 0.7, 1.0, 1.3}, {0.2, 0.5, 0.8, 1.1, 1.4}, - {0.3, 0.6, 0.9, 1.2, 1.5} - }; - + {0.3, 0.6, 0.9, 1.2, 1.5}}; + Tensor<> specific_weights_2 = { {0.1, 0.6, 1.1, 1.6, 2.1, 2.6, 3.1}, {0.2, 0.7, 1.2, 1.7, 2.2, 2.7, 3.2}, {0.3, 0.8, 1.3, 1.8, 2.3, 2.8, 3.3}, {0.4, 0.9, 1.4, 1.9, 2.4, 2.9, 3.4}, - {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5} - }; + {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5}}; Tensor<> specific_bias_1 = { 0.1, 0.2, 0.3, 0.4, - 0.5 - }; + 0.5}; Tensor<> specific_bias_2 = { 0.1, @@ -41,15 +39,13 @@ int main() { 0.4, 0.5, 0.6, - 0.7 - }; + 0.7}; Tensor<> input = { {1.1f, 2.1f, 3.1f}, {4.1f, 5.1f, 6.1f}, {7.1f, 8.1f, 9.1f}, - {10.1f, 11.1f, 12.1f} - }; + {10.1f, 11.1f, 12.1f}}; cout << "After initialization: " << endl; @@ -62,17 +58,17 @@ int main() { cout << "bias 2: " << endl; specific_bias_2.print(); - linear_1.setWeights(specific_weights_1); - linear_2.setWeights(specific_weights_2); + linear_1.set_weight(specific_weights_1); + linear_2.set_weight(specific_weights_2); - linear_1.setBiases(specific_bias_1); - linear_2.setBiases(specific_bias_2); + linear_1.set_bias(specific_bias_1); + linear_2.set_bias(specific_bias_2); cout << endl; Tensor<> output_1 = linear_1(input); - Tensor<> output_2 = dropout(output_1); - Tensor<> Y_hat = linear_2(output_2); + // Tensor<> output_2 = 
dropout(output_1); + Tensor<> Y_hat = linear_2(output_1); cout << "Y_hat: " << endl; Y_hat.print(); @@ -83,8 +79,9 @@ int main() { {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0}, {0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0}, - {0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0} - }; + {0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0}}; + + Tensor<> Y_not_one_hot = {6, 4, 3, 2}; MSE mse; CrossEntropyLoss cross_entropy_loss; @@ -96,9 +93,8 @@ int main() { Tensor<> dL_dZ = cross_entropy_loss.backward(); Tensor<> dL_dY_dot = linear_2.backward(dL_dZ); - Tensor<> dL_dY = dropout.backward(dL_dY_dot); - Tensor<> dL_dX = linear_1.backward(dL_dY); - + // Tensor<> dL_dY = dropout.backward(dL_dY_dot); + Tensor<> dL_dX = linear_1.backward(dL_dY_dot); // ===================softmax===================== diff --git a/examples/test_softmax.cpp b/examples/test_softmax.cpp index a6ab89f..84ef3e2 100644 --- a/examples/test_softmax.cpp +++ b/examples/test_softmax.cpp @@ -69,7 +69,7 @@ int main() { Tensor<> output_softmax = softmax.forward(output_1); Tensor<> output_2 = linear_2.forward(output_softmax); - double cross_entropy_loss = criterion.forward(output_2, label); + float cross_entropy_loss = criterion.forward(output_2, label); cout << "cross entropy loss: " << cross_entropy_loss << endl; diff --git a/examples/test_tensor.cpp b/examples/test_tensor.cpp index 4a592e2..5a3b92b 100644 --- a/examples/test_tensor.cpp +++ b/examples/test_tensor.cpp @@ -1,131 +1,119 @@ -#include "tensor.hpp" #include +#include "tensor.hpp" +#include +using namespace std::chrono; +using namespace std; -int main() { - Tensor<> tensor_1d {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; - Tensor<> tensor_2d {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}; - Tensor<> tensor_3d {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, {{10.0f, 11.0f, 12.0f}, {13.0f, 14.0f, 15.0f}, {16.0f, 17.0f, 18.0f}}}; - - // Tensor<> tensor_random({2, 4}, -2.0f); - - // Tensor<> tensor_negative {{-1.1f, -2.1f, -3.1f}, {-4.0f, -5.0f, -6.0f}, {-7.0f, -8.0f, -9.0f}}; - - // Tensor<> copy_tensor_2d = tensor_2d; - - // cout << "Tensor 1D: " << endl; - // tensor_1d.print(); - // cout << endl; - - // cout << "Tensor 2D: " << endl; - // tensor_2d.print(); - // cout << endl; +int main() +{ + Tensor<> tensor_1d{1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + cout << "finished creating tensor_1d" << endl; - // cout << "Tensor 3D: " << endl; - // tensor_3d.print(); - // cout << endl; + Tensor<> tensor_2d{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}; + cout << "finished creating tensor_2d" << endl; - // cout << "Tensor Random: " << endl; - // tensor_random.print(); - // cout << endl; + Tensor<> tensor_3d{{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, {{10.0f, 11.0f, 12.0f}, {13.0f, 14.0f, 15.0f}, {16.0f, 17.0f, 18.0f}}}; + cout << "finished creating tensor_3d" << endl; - // Tensor<> transposed_2d = tensor_2d.transpose(); + Tensor<> tensor_random({2, 4}, -2.0f); + cout << "finished creating tensor_random" << endl; - // cout << "Transposed Tensor 2D: " << endl; - // transposed_2d.print(); - // cout << endl; + Tensor<> tensor_negative{{-1.1f, -2.1f, -3.1f}, {-4.0f, -5.0f, -6.0f}, {-7.0f, -8.0f, -9.0f}}; + cout << "finished creating tensor_negative" << endl; - // Tensor<> transposed_1d = tensor_1d.transpose(); + Tensor<> copy_tensor_2d = tensor_2d; + cout << "finished creating copy_tensor_2d" << endl; - // cout << "Transposed Tensor 1D: " << endl; - // transposed_1d.print(); - // cout << endl; + cout << "Tensor 1D: " << endl; + tensor_1d.print(); + cout << endl; - 
// Tensor<> mat_mul_tensor = tensor_2d.matmul(transposed_2d); + cout << "Tensor 2D: " << endl; + tensor_2d.print(); + cout << endl; - // cout << "Matrix Multiplication Result: " << endl; - // mat_mul_tensor.print(); - // cout << endl; + cout << "Tensor 3D: " << endl; + tensor_3d.print(); + cout << endl; - // Tensor<> mul_tensor_vector = tensor_2d * transposed_2d; + cout << "Tensor Random: " << endl; + tensor_random.print(); + cout << endl; - // cout << "Vector Multiplication Result: " << endl; - // mul_tensor_vector.print(); - // cout << endl; + Tensor<> transposed_2d = tensor_2d.transpose(); - // tensor_2d[1, 2] = 100.0f; + cout << "Transposed Tensor 2D: " << endl; + transposed_2d.print(); + cout << endl; - // tensor_2d[0, 2] = 202.0f; + Tensor<> transposed_1d = tensor_1d.transpose(); - // tensor_2d[2, 2] = 200.0f; + cout << "Transposed Tensor 1D: " << endl; + transposed_1d.print(); + cout << endl; - // auto row1 = tensor_2d.row(1); + Tensor<> mat_mul_tensor = tensor_2d.matmul(transposed_2d); - // row1[0] = 300.0f; - // row1[1] = 400.0f; - // row1[2] = 500.0f; - // cout << endl; + cout << "Matrix Multiplication Result: " << endl; + mat_mul_tensor.print(); + cout << endl; - // cout << "Updated Tensor 2D: " << endl; - // tensor_2d.print(); - // cout << endl; + Tensor<> mul_tensor_vector = tensor_2d * transposed_2d; - // auto row_3d_1 = tensor_3d.col(0); + cout << "Vector Multiplication Result: " << endl; + mul_tensor_vector.print(); + cout << endl; - // row_3d_1[0] = 300.0f; - // row_3d_1[1] = 400.0f; - // row_3d_1[2] = 400.0f; - // cout << endl; + Tensor<> tensor_positive = tensor_negative.abs(); + cout << "Positive Tensor: " << endl; + tensor_positive.print(); + cout << endl; - // cout << "Updated Tensor 3D: " << endl; - // tensor_3d.print(); - // cout << endl; + cout << "copy tensor 2d: " << endl; + copy_tensor_2d.print(); + cout << endl; - // Tensor<> tensor_positive = tensor_negative.abs(); + Tensor<> origional_tensor_2d = tensor_2d.clone(); - // cout << "Positive Tensor: " << endl; - // tensor_positive.print(); - // cout << endl; + tensor_2d += tensor_2d; - // cout << "copy tensor 2d: " << endl; - // copy_tensor_2d.print(); - // cout << endl; + cout << "tensor_2d += tensor_2d: " << endl; + tensor_2d.print(); + cout << endl; - // tensor_2d += tensor_2d; - // tensor_2d[1, 1] = 1200.14f; - // tensor_2d[1, 0] = 1300.14f; + Tensor<> filtered_tensor = tensor_3d.filter([](float value) + { return value <= 10.0f; }); - // cout << "tensor_2d += tensor_2d: " << endl; - // tensor_2d.print(); - // cout << endl; + cout << "fitlered_values <= 10: " << endl; + filtered_tensor.print(); + cout << endl; - // Tensor<> filtered_tensor = tensor_3d.filter([](double value) { - // return value <= 10.0f; - // }); + cout << "finding max value in tensor_2d: " << endl; - // cout << "fitlered_values: " << endl; - // filtered_tensor.print(); - // cout << endl; + Tensor max_tensor_2d = tensor_2d.argmin(); - // cout << "finding max value in tensor_2d: " << endl; + cout << "max_tensor_2d: " << endl; + max_tensor_2d.print(); + cout << endl; - // Tensor max_tensor_2d = tensor_2d.argmin(); + Tensor max_tensor_1d = tensor_1d.argmin(); - // cout << "max_tensor_2d: " << endl; - // max_tensor_2d.print(); - // cout << endl; + cout << "max_tensor_1d: " << endl; + max_tensor_1d.print(); + cout << endl; - // Tensor max_tensor_1d = tensor_1d.argmin(); + Tensor equal_tensor_2d = tensor_2d.equal(tensor_2d); - // cout << "max_tensor_1d: " << endl; - // max_tensor_1d.print(); - // cout << endl; + cout << "equal_tensor_2d: " << endl; 
+ equal_tensor_2d.print(); + cout << endl; - // Tensor equal_tensor = tensor_2d.equal(tensor_2d); + Tensor equal_tensor_2d_original_transposed = origional_tensor_2d.equal(transposed_2d); - // cout << "equal_tensor: " << endl; - // equal_tensor.print(); - // cout << endl; + cout << "equal_tensor_2d_original_transposed: " << endl; + equal_tensor_2d_original_transposed.print(); + cout << endl; Tensor<> first_row_tensor_3d = tensor_3d.index({":", 0u, ":"}); @@ -133,15 +121,59 @@ int main() { first_row_tensor_3d.print(); cout << endl; - double last_value = tensor_3d[-1, 0, 1]; + float last_value = tensor_3d[-1, 0, 1]; cout << "last_value: " << last_value << endl; - Tensor<> A = { { 1, 2, 3 }, - { 4, 5, 6 } }; // 2 x 3 + Tensor<> A = {{1, 2, 3}, + {4, 5, 6}}; // 2 x 3 - Tensor<> A_mapped = A.map([](double x) { return exp(x); }); + Tensor<> A_mapped = A.map([](float x) + { return exp(x); }); A_mapped.print(); + Tensor<> new_tensor_3d = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; + Tensor<> tensor_3d_log = new_tensor_3d.map([](float x) + { return log(x); }); + + cout << "tensor_3d_log: " << endl; + tensor_3d_log.print(); + cout << endl; + + auto start = high_resolution_clock::now(); + + // Tensor<> really_large_tensor = Tensor<>({100, 100, 100}, 1.0f); + + // for (size_t i = 0; i < 100; i++) { + // cout << "iteration: " << i << endl; + // // really_large_tensor = really_large_tensor.map([](float x) { return exp(x); }); + // Tensor<> really_large_tensor_transpose = really_large_tensor.transpose(); + // size_t num_elements = really_large_tensor.size(); + // } + + // auto stop = high_resolution_clock::now(); + // auto duration = duration_cast(stop - start); + + // cout << "Time required : " << duration.count() << endl; + + Tensor<> permuted_tensor_3d = tensor_3d.permute(1, 0, 2); + + cout << "permuted_tensor_3d: " << endl; + permuted_tensor_3d.print(); + cout << endl; + + Tensor<> reshaped_permuted_tensor_3d = permuted_tensor_3d.reshape({3, 6}); + + cout << "reshaped permuted_tensor_3d: " << endl; + reshaped_permuted_tensor_3d.print(); + cout << endl; + + Tensor<> flattened_reshaped_permuted_tensor_3d = reshaped_permuted_tensor_3d.flatten(); + + cout << "flattened_reshaped_permuted_tensor_3d: " << endl; + flattened_reshaped_permuted_tensor_3d.print(); + cout << endl; + + return 0; } \ No newline at end of file
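
For reference, a minimal end-to-end sketch of how the pieces touched in this series fit together: a `Sequential` stack of `Linear`/`ReLU`/`Dropout` layers (with `bias` now defaulting to `true`), followed by a `CrossEntropyLoss` forward/backward round trip in the style of `examples/test_linear.cpp`. The `cross_entropy.hpp` header name, the `vector<Module*>` argument to `Sequential`, and the class-index target format (suggested by `Y_not_one_hot` in the example above) are assumptions, so treat this as an illustration rather than a drop-in file.

```cpp
#include "sequential.hpp"
#include "linear.hpp"
#include "relu.hpp"
#include "dropout.hpp"
#include "cross_entropy.hpp" // assumed header name for CrossEntropyLoss
#include <iostream>
#include <vector>
using namespace nn;
using namespace std;

int main()
{
    // Stack layers with the Sequential container; bias defaults to true now.
    vector<Module*> layers = { new Linear(3, 5),
                               new ReLU(),
                               new Dropout(0.2),
                               new Linear(5, 7) };
    Sequential container = layers;

    // Same toy batch as examples/test_linear.cpp: 4 samples, 3 features each.
    Tensor<> input = {
        {1.1f, 2.1f, 3.1f},
        {4.1f, 5.1f, 6.1f},
        {7.1f, 8.1f, 9.1f},
        {10.1f, 11.1f, 12.1f}};

    // Class-index targets over 7 classes (assumed format, as with Y_not_one_hot).
    Tensor<> label = {6, 4, 3, 2};

    CrossEntropyLoss criterion;

    // Forward pass through the whole stack, then the loss.
    Tensor<> Y_hat = container(input);
    float loss = criterion.forward(Y_hat, label);
    cout << "loss: " << loss << endl;

    // Backward pass: gradient of the loss first, then back through the container.
    Tensor<> grad_output = criterion.backward();
    container.backward(grad_output);

    return 0;
}
```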