diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c09c2f..cc64f52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,16 +31,19 @@ include_directories( # Add source files set(SOURCE_FILES - src/core/module.cpp src/core/tensor.cpp + src/utils/tensor_utils.cpp + src/core/module.cpp src/modules/layers/linear.cpp + src/modules/layers/conv2d.cpp + src/modules/layers/flatten.cpp + src/utils/conv2d_utils.cpp src/modules/layers/dropout.cpp src/modules/layers/conv2d.cpp src/modules/losses/mse.cpp src/modules/activations/relu.cpp src/modules/activations/softmax.cpp src/modules/losses/cross_entropy.cpp - src/utils/tensor_utils.cpp src/datasets/mnist.cpp src/models/mlp.cpp src/metrics/accuracy.cpp @@ -51,7 +54,7 @@ set(SOURCE_FILES add_library(neuralnet ${SOURCE_FILES}) # Add the executable for the main example -add_executable(main examples/test_tensor.cpp) +add_executable(main examples/test_conv2d.cpp) target_link_libraries(main neuralnet) # Only build tests if BUILD_TESTS is ON diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..2cf2f4d --- /dev/null +++ b/debug.sh @@ -0,0 +1,4 @@ +cd build/ +cmake -DCMAKE_BUILD_TYPE=Debug .. +make +lldb main \ No newline at end of file diff --git a/examples/main.cpp b/examples/main.cpp index 1c62a61..57a301f 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -6,7 +6,8 @@ #include "utils.hpp" using namespace nn; -int main() { +int main() +{ // Define the hyperparameters @@ -21,7 +22,8 @@ int main() { const string mnist_label_file = "../data/mnist/train-labels.idx1-ubyte"; // load MNIST data - if (!dataset.load_data(mnist_image_file, mnist_label_file)) { + if (!dataset.load_data(mnist_image_file, mnist_label_file)) + { cerr << "Failed to load dataset" << endl; return 1; } @@ -29,23 +31,31 @@ int main() { // Initialize the model MLP model = MLP({784, 128, 64, 10}, DROPOUT_P); + cout << "Finished model initialization" << endl; + // Define the loss function CrossEntropyLoss criterion = CrossEntropyLoss(); + cout << "Finished loss initialization" << endl; + double loss = 0.0; double acc = 0.0; vector loss_list; vector accuracy_list; + cout << "Training started..." 
<< endl; + // // Train the model // Example of iterating through all batches - for (size_t e = 0; e < EPOCH; e++) { + for (size_t e = 0; e < EPOCH; e++) + { cout << "\nEpoch " << e + 1 << ":\n"; - dataset.reset(); // Reset batch counter at the start of each epoch + dataset.reset(); // Reset batch counter at the start of each epoch loss_list.clear(); accuracy_list.clear(); - - for (size_t i = 0; i < dataset.get_num_batches(); i++) { + + for (size_t i = 0; i < dataset.get_num_batches(); i++) + { auto batch = dataset.get_next_batch(); auto [data, labels] = batch.to_tensor(); @@ -53,9 +63,7 @@ int main() { Tensor<> output = model(data); loss = criterion(output, labels); - // cout << "After loss" << endl; acc = metrics::accuracy(output, labels); - // cout << "After acc" << endl; accuracy_list.push_back(acc); loss_list.push_back(loss); @@ -66,7 +74,7 @@ int main() { model.update_params(LR); // print the training stats - print_training_stats_line(i, loss, acc); + print_stats_line(i, loss, acc); } double total_loss = accumulate(loss_list.begin(), loss_list.end(), 0.0) / loss_list.size(); @@ -78,5 +86,51 @@ int main() { cout << "------------------------------------" << endl; } + // Inference + + model.eval(); + + const string mnist_image_file_test = "../data/mnist/t10k-images.idx3-ubyte"; + const string mnist_label_file_test = "../data/mnist/t10k-labels.idx1-ubyte"; + + MNIST test_dataset(BATCH_SIZE); + + if (!test_dataset.load_data(mnist_image_file_test, mnist_label_file_test)) + { + cerr << "Failed to load test dataset" << endl; + return 1; + } + + cout << "\n------------------------------------" << endl; + cout << "Testing started..." << endl; + + loss = 0.0; + acc = 0.0; + loss_list.clear(); + accuracy_list.clear(); + + for (size_t i = 0; i < test_dataset.get_num_batches(); i++) + { + auto batch = test_dataset.get_next_batch(); + auto [data, labels] = batch.to_tensor(); + + // forward propagation + Tensor<> output = model(data); + + loss = criterion(output, labels); + acc = metrics::accuracy(output, labels); + + accuracy_list.push_back(acc); + loss_list.push_back(loss); + + // print the testing stats + print_stats_line(i, loss, acc); + } + + double total_loss = accumulate(loss_list.begin(), loss_list.end(), 0.0) / loss_list.size(); + double total_acc = accumulate(accuracy_list.begin(), accuracy_list.end(), 0.0) / accuracy_list.size() * 100; + + cout << "------------------------------------" << endl; + return 0; } diff --git a/examples/test_conv2d.cpp b/examples/test_conv2d.cpp new file mode 100644 index 0000000..f9bbd43 --- /dev/null +++ b/examples/test_conv2d.cpp @@ -0,0 +1,236 @@ +#include "conv2d.hpp" +#include "flatten.hpp" +#include "linear.hpp" +#include "cross_entropy.hpp" +using namespace nn; + +int main() +{ + size_t batch_size = 2; + size_t input_data_size = 15; + + size_t in_channels = 4; + size_t out_channels = 8; + size_t weight_size = 3; + size_t padding = 3; + size_t stride = 2; + size_t dilation = 2; + string padding_mode = "zeros"; + bool use_bias = true; + + size_t out_channels_2 = 7; + size_t weight_size_2 = 3; + size_t padding_2 = 3; + size_t stride_2 = 2; + size_t dilation_2 = 4; + string padding_mode_2 = "zeros"; + bool use_bias_2 = true; + + size_t in_features = out_channels_2 * input_data_size * input_data_size; + size_t out_features = 10; + + Tensor<> test_weight = Tensor<>({out_channels, in_channels, weight_size, weight_size}, 0.0f); + Tensor<> test_bias = Tensor<>({out_channels}, 0.0f); + + double val = 0.01; + for (size_t i = 0; i < out_channels; i++) + { + for 
(size_t j = 0; j < in_channels; j++) + { + for (size_t k = 0; k < weight_size; k++) + { + for (size_t l = 0; l < weight_size; l++) + { + test_weight[i, j, k, l] = val; + val += 0.01; + } + } + } + } + + val = 0.01; + for (size_t i = 0; i < out_channels; i++) + { + test_bias[i] = val; + val += 0.01; + } + + Tensor<> test_weight_2 = Tensor<>({out_channels_2, out_channels, weight_size_2, weight_size_2}, 0.0f); + Tensor<> test_bias_2 = Tensor<>({out_channels_2}, 0.0f); + + val = 0.01; + for (size_t i = 0; i < out_channels_2; i++) + { + for (size_t j = 0; j < out_channels; j++) + { + for (size_t k = 0; k < weight_size_2; k++) + { + for (size_t l = 0; l < weight_size_2; l++) + { + test_weight_2[i, j, k, l] = val; + val += 0.01; + } + } + } + } + + val = 0.01; + for (size_t i = 0; i < out_channels_2; i++) + { + test_bias_2[i] = val; + val += 0.01; + } + + Tensor<> test_input = Tensor<>({batch_size, in_channels, input_data_size, input_data_size}, 0.0f); + + val = 0.01; + for (size_t i = 0; i < batch_size; i++) + { + for (size_t j = 0; j < in_channels; j++) + { + for (size_t k = 0; k < input_data_size; k++) + { + for (size_t l = 0; l < input_data_size; l++) + { + test_input[i, j, k, l] = val; + val += 0.01; + } + } + } + } + + // cout << "Test input: " << endl; + // test_input.print(); + // cout << endl; + + // cout << "Test weight: " << endl; + // test_weight.print(); + // cout << endl; + + // cout << "Test bias: " << endl; + // test_bias.print(); + // cout << endl; + + // cout << "Test weight 2: " << endl; + // test_weight_2.print(); + // cout << endl; + + // cout << "Test bias 2: " << endl; + // test_bias_2.print(); + // cout << endl; + + Conv2d conv2d_1(in_channels, out_channels, weight_size, padding, stride, dilation, padding_mode, use_bias); + Conv2d conv2d_2(out_channels, out_channels_2, weight_size_2, padding_2, stride_2, dilation_2, padding_mode_2, use_bias_2); + Flatten flatten; + CrossEntropyLoss cross_entropy; + + conv2d_1.set_weight(test_weight); + conv2d_1.set_bias(test_bias); + + conv2d_2.set_weight(test_weight_2); + conv2d_2.set_bias(test_bias_2); + + // cout << "Test input: " << endl; + // test_input.print(); + // cout << endl; + + Tensor<> output = conv2d_1(test_input); + Tensor<> output_2 = conv2d_2(output); + + cout << "Output: " << endl; + output.print(); + cout << endl; + + cout << "Output 2: " << endl; + output_2.print(); + cout << endl; + + cout << "output shape : "; + for (int i = 0; i < output.ndim(); i++) + { + cout << output.shapes()[i] << " "; + } + cout << endl; + + cout << "output_2 shape : "; + for (int i = 0; i < output_2.ndim(); i++) + { + cout << output_2.shapes()[i] << " "; + } + cout << endl; + + Tensor<> flattened_output_2 = flatten(output_2); + + cout << "Flattened output 2: " << endl; + flattened_output_2.print(); + cout << endl; + + Linear linear(flattened_output_2.shapes()[1], out_features, false); + + Tensor<> test_linear_weight({flattened_output_2.shapes()[1], out_features}, 0.0); + + val = 0.01; + for (size_t i = 0; i < flattened_output_2.shapes()[1]; ++i) + { + for (size_t j = 0; j < out_features; ++j) + { + test_linear_weight[i, j] = val; + val += 0.01; + } + } + + cout << "linear in features: " << flattened_output_2.shapes()[1] << endl; + cout << "linear out features: " << out_features << endl; + + linear.set_weight(test_linear_weight); + + cout << "Test linear weight: " << endl; + test_linear_weight.print(); + cout << endl; + + Tensor<> output_3 = linear(flattened_output_2); + + cout << "Output 3: " << endl; + output_3.print(); + cout << endl; + + 
Tensor<> labels({batch_size}, 0); + + val = 1; + for (size_t i = 0; i < batch_size; i++) + { + labels[i] = val; + val++; + } + + // output.print(); + // output_2.print(); + // output_3.print(); + + output_3 /= 1e6; + + double loss = cross_entropy(output_3, labels); + + cout << "Loss: " << loss << endl; + + Tensor<> dL_dV_2 = cross_entropy.backward(); + Tensor<> dL_dV_1 = linear.backward(dL_dV_2); + Tensor<> dL_dZ = flatten.backward(dL_dV_1); + + cout << "dL/dZ: " << endl; + dL_dZ.print(); + cout << endl; + + Tensor<> dL_dY = conv2d_2.backward(dL_dZ); + Tensor<> dL_dX = conv2d_1.backward(dL_dY); + + cout << "dL_dY: " << endl; + dL_dY.print(); + cout << endl; + + cout << "dL_dX: " << endl; + dL_dX.print(); + cout << endl; + + return 0; +} \ No newline at end of file diff --git a/include/core/loss.hpp b/include/core/loss.hpp index d35d8de..4f3eb14 100644 --- a/include/core/loss.hpp +++ b/include/core/loss.hpp @@ -4,6 +4,7 @@ namespace nn { class Loss { public: + Loss() = default; virtual ~Loss() = default; virtual double forward(const Tensor<>& Y_hat, const Tensor<>& Y) = 0; virtual Tensor<> backward() = 0; @@ -11,7 +12,6 @@ class Loss { protected: - Tensor<> grad_output_; Tensor<> Y_cache_; Tensor<> Y_hat_cache_; }; diff --git a/include/core/module.hpp b/include/core/module.hpp index 986cf16..ea7b266 100644 --- a/include/core/module.hpp +++ b/include/core/module.hpp @@ -10,6 +10,7 @@ namespace nn { class Module { public: + Module() = default; /** * Virtual destructor for the Module class. */ diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 8310ced..00f201f 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -1,906 +1,1274 @@ #pragma once -#include -#include #include "tensor_utils.hpp" using namespace std; - template -class Tensor { - private: - shared_ptr> data_ = make_shared>(); // data is stored as a 1D vector // shared_ptr is used to avoid copying data - vector shapes_; // store the dimensions of the tensor - vector strides_; // store the strides of the tensor - size_t offset_ = 0; // offset for slicing - - // Helper function to calculate the index in the 1D vector for a given set of indices expressed in the form of N-D vector - size_t calculate_idx(const vector& idxs) const { - size_t idx = this->offset_; - for (size_t i = 0; i < idxs.size(); ++i) { - idx += idxs[i] * this->strides_[i]; - } - return idx; +class Tensor +{ +private: + shared_ptr> data_ = nullptr; // data is stored as a 1D vector // shared_ptr is used to avoid copying data + vector shape_; // store the dimensions of the tensor + vector strides_; // store the strides of the tensor + size_t offset_ = 0; // offset for slicing + mutable int64_t size_ = -1; // it can be changed by const member functions (in size() function) + + // Helper function to calculate the index in the 1D vector for a given set of indices expressed in the form of N-D vector + size_t calculate_idx(const vector &idxs) const + { + size_t idx = this->offset_; + for (size_t i = 0; i < idxs.size(); ++i) + { + idx += idxs[i] * this->strides_[i]; + } + return idx; + } + + // Helper function for printing since we don't know the number of dimensions + void print_recursive_impl(size_t dim, size_t offset, int indent = 0) const + { + const string indent_str(indent, ' '); + + // Handle empty dimensions + if (this->shape_[dim] == 0) + { + cout << indent_str << "[]"; + return; } - // Helper function for printing since we don't know the number of dimensions - void print_recursive_impl(size_t dim, size_t offset, int indent = 0) const { 
- const string indent_str(indent, ' '); + cout << indent_str << "["; - // Handle empty dimensions - if (this->shapes_[dim] == 0) { - cout << indent_str << "[]"; - return; + if (dim == this->ndim() - 1) + { // Last dimension + for (size_t i = 0; i < this->shape_[dim]; ++i) + { + cout << (*this->data_)[offset + i * this->strides_[dim]]; + if (i < this->shape_[dim] - 1) + cout << ", "; } - - cout << indent_str << "["; - - if (dim == this->ndim() - 1) { // Last dimension - for (size_t i = 0; i < this->shapes_[dim]; ++i) { - cout << (*this->data_)[offset + i * this->strides_[dim]]; - if (i < this->shapes_[dim] - 1) cout << ", "; - } - } else { - cout << "\n"; - for (size_t i = 0; i < this->shapes_[dim]; ++i) { - print_recursive_impl(dim + 1, offset + i * this->strides_[dim], indent + 2); - if (i < this->shapes_[dim] - 1) cout << ",\n"; - } - cout << "\n" << indent_str; + } + else + { + cout << "\n"; + for (size_t i = 0; i < this->shape_[dim]; ++i) + { + print_recursive_impl(dim + 1, offset + i * this->strides_[dim], indent + 2); + if (i < this->shape_[dim] - 1) + cout << ",\n"; } - cout << "]"; + cout << "\n" + << indent_str; + } + cout << "]"; + } + + // Helper function for operator[] overloading + template + const vector get_idxs(Indices... indices) const + { + // Convert variadic arguments to vector + vector idxs({static_cast(indices)...}); + vector normalized_idxs; + + // for better performance, reserve the size of the vector + normalized_idxs.reserve(idxs.size()); + + // Check bounds + for (size_t i = 0; i < idxs.size(); ++i) + { + size_t normalized_idx = normalize_index(idxs[i], this->shape_[i]); + normalized_idxs.push_back(normalized_idx); } - // Helper function for operator[] overloading - template - const vector get_idxs(Indices... indices) const { - // Convert variadic arguments to vector - vector idxs({static_cast(indices)...}); - vector normalized_idxs; + return normalized_idxs; + } + + /** + * Reduces a 1D or 2D tensor along its rows using the specified reduction operation. + * + * @tparam U The data type of the resulting tensor. Defaults to the type of the current tensor. + * @param op The reduction operation to perform. Supported operations are MAX, MIN, ARGMAX, and ARGMIN. + * @return A Tensor of shape (num_rows, 1) containing the reduced values or indices. + * @throws runtime_error if the tensor's number of dimensions is greater than 2. + */ + + template + Tensor reduce_impl(ReduceOp op) const + { + const size_t ndim = this->ndim(); + + if (ndim > 2) + { + throw std::runtime_error("Only 1D and 2D tensors are supported for reduce"); + } - // for better performance, reserve the size of the vector - normalized_idxs.reserve(idxs.size()); + // Determine tensor dimensions + const size_t num_rows = (ndim == 2) ? this->shape_[0] : 1; + const size_t num_cols = (ndim == 2) ? this->shape_[1] : this->shape_[0]; - // Check bounds - for (size_t i = 0; i < idxs.size(); ++i) { - size_t normalized_idx = normalize_index(idxs[i], this->shapes_[i]); - normalized_idxs.push_back(normalized_idx); - } + vector result(num_rows); - return normalized_idxs; - } + for (size_t i = 0; i < num_rows; ++i) + { + // Calculate base offset for current row + size_t row_offset = this->offset_; + if (ndim == 2) + { + row_offset += i * this->strides_[0]; + } - /** - * Reduces a 1D or 2D tensor along its rows using the specified reduction operation. - * - * @tparam U The data type of the resulting tensor. Defaults to the type of the current tensor. - * @param op The reduction operation to perform. 
Supported operations are MAX, MIN, ARGMAX, and ARGMIN. - * @return A Tensor of shape (num_rows, 1) containing the reduced values or indices. - * @throws runtime_error if the tensor's number of dimensions is greater than 2. - */ + T extreme_val = (*this->data_)[row_offset]; + size_t extreme_idx = 0; - template - Tensor reduce_impl(ReduceOp op) const { - if (this->ndim() > 2) { - throw std::runtime_error("Only 1D and 2D tensors are supported for reduce"); - } + // Process elements using stride-aware indexing + for (size_t j = 1; j < num_cols; ++j) + { + size_t elem_offset = row_offset; + if (ndim == 2) + { + elem_offset += j * this->strides_[1]; + } + else + { + elem_offset += j * this->strides_[0]; + } - const size_t num_rows = (this->ndim() == 2)? this->shapes_[0] : 1; - const size_t num_cols = (this->ndim() == 2)? this->shapes_[1] : this->shapes_[0]; - - - // Result will be a tensor of shape (num_rows, 1) - vector result(num_rows); - - for (size_t i = 0; i < num_rows; i++) { - size_t start_idx = i * num_cols; - - // Initialize with first element in row - T extreme_val = (*this->data_)[start_idx]; - size_t extreme_idx = 0; - - // Process remaining elements in the row - for (size_t j = 1; j < num_cols; j++) { - size_t curr_idx = start_idx + j; - bool update = false; - - switch (op) { - case ReduceOp::MAX: - case ReduceOp::ARGMAX: - update = (*this->data_)[curr_idx] > extreme_val; - break; - case ReduceOp::MIN: - case ReduceOp::ARGMIN: - update = (*this->data_)[curr_idx] < extreme_val; - break; - } - - if (update) { - extreme_val = (*this->data_)[curr_idx]; - extreme_idx = j; - } + bool update = false; + switch (op) + { + case ReduceOp::MAX: + case ReduceOp::ARGMAX: + update = (*this->data_)[elem_offset] > extreme_val; + break; + case ReduceOp::MIN: + case ReduceOp::ARGMIN: + update = (*this->data_)[elem_offset] < extreme_val; + break; } - - // Store the result - switch (op) { - case ReduceOp::MAX: - case ReduceOp::MIN: - result[i] = extreme_val; - break; - case ReduceOp::ARGMAX: - case ReduceOp::ARGMIN: - result[i] = extreme_idx; - break; + + if (update) + { + extreme_val = (*this->data_)[elem_offset]; + extreme_idx = j; } } - - return Tensor(result); - } - Tensor arithmetic_operation_impl(ArithmeticOp op, const Tensor& other) const { - if (other.shapes_ != this->shapes_) { - throw runtime_error("Shape mismatch in arithmetic operation"); + switch (op) + { + case ReduceOp::MAX: + case ReduceOp::MIN: + result[i] = static_cast(extreme_val); + break; + case ReduceOp::ARGMAX: + case ReduceOp::ARGMIN: + result[i] = static_cast(extreme_idx); + break; } + } - size_t ndim = this->ndim(); - - Tensor result(this->shapes_, static_cast(0)); - - // Precompute result's contiguous strides for index calculation - const vector& result_strides = result.strides_; - - for (size_t i = 0; i < this->size(); i++) { - auto [a_offset, b_offset] = calculate_tensors_offsets(i, ndim, result_strides, other); - - switch (op) { - case ArithmeticOp::ADD: - (*result.data_)[i] = (*this->data_)[a_offset] + (*other.data_)[b_offset]; - break; - case ArithmeticOp::SUB: - (*result.data_)[i] = (*this->data_)[a_offset] - (*other.data_)[b_offset]; - break; - case ArithmeticOp::MUL: - (*result.data_)[i] = (*this->data_)[a_offset] * (*other.data_)[b_offset]; - break; - case ArithmeticOp::DIV: - (*result.data_)[i] = (*this->data_)[a_offset] / (*other.data_)[b_offset]; - break; - } + return Tensor(result); + } + + Tensor arithmetic_operation_impl(ArithmeticOp op, const Tensor &other) const + { + if (other.shape_ != this->shape_) + { + 
throw runtime_error("Shape mismatch in arithmetic operation"); + } + + size_t ndim = this->ndim(); + + Tensor result(this->shape_, static_cast(0)); + + // Precompute result's contiguous strides for index calculation + const vector &result_strides = result.strides_; + + for (size_t i = 0; i < this->size(); i++) + { + auto [a_offset, b_offset] = calculate_tensors_offsets(i, ndim, result_strides, other); + + switch (op) + { + case ArithmeticOp::ADD: + (*result.data_)[i] = (*this->data_)[a_offset] + (*other.data_)[b_offset]; + break; + case ArithmeticOp::SUB: + (*result.data_)[i] = (*this->data_)[a_offset] - (*other.data_)[b_offset]; + break; + case ArithmeticOp::MUL: + (*result.data_)[i] = (*this->data_)[a_offset] * (*other.data_)[b_offset]; + break; + case ArithmeticOp::DIV: + (*result.data_)[i] = (*this->data_)[a_offset] / (*other.data_)[b_offset]; + break; } - return result; } + return result; + } - // Helper function to cacluate the stride of the tensor - void calculate_strides() { - this->strides_.resize(this->ndim(), 0); - vector strides(this->ndim()); + // Helper function to cacluate the stride of the tensor + void compute_contiguous_strides() + { + this->strides_.resize(this->ndim(), 0); - int64_t stride = 1; + int64_t stride = 1; - for (int64_t i = this->ndim() - 1; i >= 0; --i) { - this->strides_[i] = stride; - stride *= this->shapes_[i]; - } + for (int64_t i = this->ndim() - 1; i >= 0; --i) + { + this->strides_[i] = stride; + stride *= this->shape_[i]; } + } - std::tuple calculate_tensors_offsets(const size_t idx, const size_t ndim, const vector& result_strides, const Tensor& other) const { - vector indices(ndim); + std::tuple calculate_tensors_offsets(const size_t idx, const size_t ndim, const vector &result_strides, const Tensor &other) const + { + vector indices(ndim); - size_t remaining = idx; + size_t remaining = idx; - for (int dim = 0; dim < ndim; ++dim) { - indices[dim] = remaining / result_strides[dim]; - remaining %= result_strides[dim]; - } + for (int64_t dim = 0; dim < ndim; ++dim) + { + indices[dim] = remaining / result_strides[dim]; + remaining %= result_strides[dim]; + } - // Calculate offsets using original tensors' strides - size_t a_offset = this->offset_; - size_t b_offset = other.offset_; + // Calculate offsets using original tensors' strides + size_t a_offset = this->offset_; + size_t b_offset = other.offset_; - for (int dim = 0; dim < ndim; ++dim) { - a_offset += indices[dim] * this->strides_[dim]; - b_offset += indices[dim] * other.strides_[dim]; - } + for (int64_t dim = 0; dim < ndim; ++dim) + { + a_offset += indices[dim] * this->strides_[dim]; + b_offset += indices[dim] * other.strides_[dim]; + } - return {a_offset, b_offset}; - } - - // Declare friendship so that TensorView can access private members of Tensor - template - friend Tensor dtype_impl(const Tensor& tensor); - - public: - Tensor() = default; - - // Helper to recursively flatten nested vectors and compute shapes - template - void flatten_vector(const std::vector& vec, size_t depth = 0) { - // Add current level's size to shapes - if (depth == this->shapes_.size()) { - // First encounter with this depth: record size - this->shapes_.push_back(vec.size()); - - } - else { - // Verify size matches the existing dimension - if (vec.size() != this->shapes_[depth]) { - throw std::invalid_argument("Inconsistent shape at depth " + std::to_string(depth)); - } + return {a_offset, b_offset}; + } + + // Helper to recursively flatten nested vectors and compute shapes + template + void flatten_vector(const 
std::vector &vec, size_t depth = 0) + { + // Add current level's size to shapes + if (depth == this->shape_.size()) + { + // First encounter with this depth: record size + this->shape_.push_back(vec.size()); + } + else + { + // Verify size matches the existing dimension + if (vec.size() != this->shape_[depth]) + { + throw std::invalid_argument("Inconsistent shape at depth " + std::to_string(depth)); } + } - if constexpr (is_vector::value) { - // Ensure nested vectors have consistent sizes at this level - if (!vec.empty()) { - size_t expected_size = vec[0].size(); - for (const auto& elem : vec) { - if (elem.size() != expected_size) { - throw std::invalid_argument("Inconsistent shape in nested vectors"); - } + if constexpr (is_vector::value) + { + // Ensure nested vectors have consistent sizes at this level + if (!vec.empty()) + { + size_t expected_size = vec[0].size(); + for (const auto &elem : vec) + { + if (elem.size() != expected_size) + { + throw std::invalid_argument("Inconsistent shape in nested vectors"); } } + } - // Recurse into nested vectors - for (const auto& elem : vec) { - flatten_vector(elem, depth + 1); - } - } else { - // Ensure leaf elements match the Tensor's data type - // static_assert(std::is_same_v, "Element type must match Tensor type"); - for (const auto& elem : vec) { - this->data_->push_back(static_cast(elem)); - } + // Recurse into nested vectors + for (const auto &elem : vec) + { + flatten_vector(elem, depth + 1); } } - - // Constructor for nested vectors - template - Tensor(const std::vector& input) { - flatten_vector(input); - this->calculate_strides(); - } - - // // Recursive helper to process nested initializer lists - // template - // void flatten_list(const std::initializer_list& list, size_t depth = 0) { - // // Handle the current dimension - // if (depth == shapes_.size()) { - // // First encounter with this depth: record size - // shapes_.push_back(list.size()); - // } else { - // // Verify size matches the existing dimension - // if (list.size() != shapes_[depth]) { - // throw std::invalid_argument("Inconsistent shape at depth " + std::to_string(depth)); - // } - // } - - // // Recurse or add data - // if constexpr (is_list::value) { - // // Process nested lists - // for (const auto& elem : list) { - // flatten_list(elem, depth + 1); - // } - // } else { - // // Ensure element type matches Tensor type - // // static_assert(std::is_same_v, "Element type must match Tensor type"); - // for (const auto& elem : list) { - // data_.push_back(static_cast(elem)); - // } - // } - // } - - - // Scaler constructor - Tensor(const T& value) { - this->shapes_ = vector {1}; - this->data_ = make_shared>(1, value); - this->calculate_strides(); + else + { + // Ensure leaf elements match the Tensor's data type + // static_assert(std::is_same_v, "Element type must match Tensor type"); + this->data_->reserve(this->data_->size() + vec.size()); + for (const auto &elem : vec) + { + this->data_->emplace_back(static_cast(elem)); + } } - - // 1D tensor constructor - Tensor(const initializer_list& data_1d) { - this->data_ = make_shared>(data_1d.begin(), data_1d.end()); - this->shapes_ = vector { data_1d.size() }; - this->calculate_strides(); + } + + // Declare friendship so that TensorView can access private members of Tensor + template + friend Tensor dtype_impl(const Tensor &tensor); + +public: + Tensor() = default; + + // Constructor for nested vectors + template + Tensor(const std::vector &input) + { + this->data_ = make_shared>(); + flatten_vector(input); + 
this->compute_contiguous_strides(); + } + + // // Recursive helper to process nested initializer lists + // template + // void flatten_list(const std::initializer_list& list, size_t depth = 0) { + // // Handle the current dimension + // if (depth == shapes_.size()) { + // // First encounter with this depth: record size + // shapes_.push_back(list.size()); + // } else { + // // Verify size matches the existing dimension + // if (list.size() != shapes_[depth]) { + // throw std::invalid_argument("Inconsistent shape at depth " + std::to_string(depth)); + // } + // } + + // // Recurse or add data + // if constexpr (is_list::value) { + // // Process nested lists + // for (const auto& elem : list) { + // flatten_list(elem, depth + 1); + // } + // } else { + // // Ensure element type matches Tensor type + // // static_assert(std::is_same_v, "Element type must match Tensor type"); + // for (const auto& elem : list) { + // data_.push_back(static_cast(elem)); + // } + // } + // } + + // Scaler constructor + Tensor(const T &value) + { + this->shape_ = vector{1}; + this->data_ = make_shared>(1, value); + this->compute_contiguous_strides(); + } + + // 1D tensor constructor + Tensor(const initializer_list &data_1d) + { + this->data_ = make_shared>(data_1d.begin(), data_1d.end()); + this->shape_ = vector{data_1d.size()}; + this->compute_contiguous_strides(); + } + + // 2D tensor constructor + Tensor(const initializer_list> &data_2d) + { + const size_t n = data_2d.size(), m = data_2d.begin()->size(); + + this->shape_ = vector{n, m}; + + this->data_ = make_shared>(); + this->data_->reserve(n * m); // Optimize memory allocation + + for (const initializer_list &row : data_2d) + { + this->data_->insert(this->data_->end(), row.begin(), row.end()); } + this->compute_contiguous_strides(); + } - // 2D tensor constructor - Tensor(const initializer_list>& data_2d) { - const size_t n = data_2d.size(), m = data_2d.begin()->size(); + // 3D tensor constructor + Tensor(const initializer_list>> &data_3d) + { + const size_t n = data_3d.size(), m = data_3d.begin()->size(), l = data_3d.begin()->begin()->size(); - this->shapes_ = vector { n, m }; + this->shape_ = vector{n, m, l}; - this->data_->reserve(n * m); // Optimize memory allocation + this->data_ = make_shared>(); + this->data_->reserve(n * m * l); // Optimize memory allocation - for (const initializer_list& row : data_2d) { + for (const initializer_list> &matrix : data_3d) + { + for (const initializer_list &row : matrix) + { this->data_->insert(this->data_->end(), row.begin(), row.end()); } - this->calculate_strides(); } + this->compute_contiguous_strides(); + } - // 3D tensor constructor - Tensor(const initializer_list>>& data_3d) { - const size_t n = data_3d.size(), m = data_3d.begin()->size(), l = data_3d.begin()->begin()->size(); + // 4D tensor constructor + Tensor(const initializer_list>>> &data_4d) + { + const size_t n = data_4d.size(), m = data_4d.begin()->size(), l = data_4d.begin()->begin()->size(), k = data_4d.begin()->begin()->begin()->size(); - this->shapes_ = vector { n, m, l }; + this->shape_ = vector{n, m, l, k}; - this->data_->reserve(n * m * l); // Optimize memory allocation + this->data_ = make_shared>(); + this->data_->reserve(n * m * l * k); // Optimize memory allocation - for (const initializer_list>& matrix : data_3d) { - for (const initializer_list& row : matrix) { + for (const initializer_list>> &tensor : data_4d) + { + for (const initializer_list> &matrix : tensor) + { + for (const initializer_list &row : matrix) + { 
this->data_->insert(this->data_->end(), row.begin(), row.end()); } } - this->calculate_strides(); + } + this->compute_contiguous_strides(); + } + + // certain value constructor + Tensor(const vector &shape, const T &value) + { + this->shape_ = shape; + size_t size = 1; + for (const size_t &dim : shape) + { + size *= dim; } - // 4D tensor constructor - Tensor(const initializer_list>>>& data_4d) { - const size_t n = data_4d.size(), m = data_4d.begin()->size(), l = data_4d.begin()->begin()->size(), k = data_4d.begin()->begin()->begin()->size(); - - this->shapes_ = vector { n, m, l, k }; + this->data_ = make_shared>(size, value); + this->compute_contiguous_strides(); + } + + // copy constructor + Tensor(const Tensor &other) + { + // the = operator is already overloaded + *this = other; + } + + // template + // Tensor(const Tensor &other) + // { + // // use dtype function to convert the data type + // *this = other.dtype<>(); + // } + + // Add two tensors with same shape, element-wise + inline Tensor add(const Tensor &other) const + { + return arithmetic_operation_impl(ArithmeticOp::ADD, other); + } + + // Subtract two tensors with same shape, element-wise + inline Tensor sub(const Tensor &other) const + { + return arithmetic_operation_impl(ArithmeticOp::SUB, other); + } + + // Multiply two tensors with same shape, element-wise + inline Tensor mul(const Tensor &other) const + { + return arithmetic_operation_impl(ArithmeticOp::MUL, other); + } + + // Divide two tensors with same shape, element-wise + inline Tensor div(const Tensor &other) const + { + return arithmetic_operation_impl(ArithmeticOp::DIV, other); + } + + // Multiply all elements of the tensor by the given scalar + Tensor mul(const T &scaler) const + { + Tensor result = *this; + + for (size_t i = 0; i < this->size(); i++) + { + (*result.data_)[i] *= scaler; + } + return result; + } - this->data_->reserve(n * m * l * k); // Optimize memory allocation + // Divide all elements of the tensor by the given scalar + Tensor div(const T &scaler) const + { + Tensor result = *this; - for (const initializer_list>>& tensor : data_4d) { - for (const initializer_list>& matrix : tensor) { - for (const initializer_list& row : matrix) { - this->data_->insert(this->data_->end(), row.begin(), row.end()); - } - } - } - this->calculate_strides(); + for (size_t i = 0; i < this->size(); i++) + { + (*result.data_)[i] /= scaler; } - - // certin value constructor - Tensor(const vector& shape, const T& value) { - this->shapes_ = shape; - size_t size = 1; - for (const size_t& dim : shape) { - size *= dim; - } - - this->data_->resize(size, value); - this->calculate_strides(); + return result; + } + + /** + * Matrix multiplication of two tensors. + * + * The two tensors must have at least two dimensions. The leading dimensions (all except last two) must be equal. + * The last two dimensions must match the matrix multiplication dimensions. + * For example, if the first tensor has shape [a, b, n, m] and the second tensor has shape [a, b, m, p], the result will have shape [a, b, n, p]. + * + * The result is a tensor with the leading dimensions of the first tensor and the matrix multiplication result as the last two dimensions. + * + * The total number of batches is the product of the leading dimensions. + * + * The matrix multiplication is performed batched, i.e., for each batch, a matrix multiplication is performed. + * + * @param other The tensor to multiply with. + * @return The result of the matrix multiplication.
+ */ + Tensor matmul(const Tensor &other) const + { + // Ensure both tensors have at least 2 dimensions + size_t A_ndim = this->ndim(), B_ndim = other.ndim(); + + if (A_ndim < 2 || B_ndim < 2) + { + throw std::runtime_error("Tensors must have at least 2 dimensions for matrix multiplication"); } - // copy constructor - Tensor(const Tensor& other) { - *this = other; - } + // Check leading dimensions (all except last two) are equal + const size_t A_leading_ndim = A_ndim - 2; + const size_t B_leading_ndim = B_ndim - 2; - // Add two tensors with same shape, element-wise - inline Tensor add(const Tensor& other) const { - return arithmetic_operation_impl(ArithmeticOp::ADD, other); + if (A_leading_ndim != B_leading_ndim) + { + throw std::runtime_error("Number of leading dimensions must match"); } - // Subtract two tensors with same shape, element-wise - inline Tensor sub(const Tensor& other) const { - return arithmetic_operation_impl(ArithmeticOp::SUB, other); - } + vector A_leading_shape(this->shape_.begin(), this->shape_.end() - 2); + vector B_leading_shape(other.shape_.begin(), other.shape_.end() - 2); - // Multiply two tensors with same shape, element-wise - inline Tensor mul(const Tensor& other) const { - return arithmetic_operation_impl(ArithmeticOp::MUL, other); + if (A_leading_shape != B_leading_shape) + { + throw invalid_argument("Batch dimensions must match"); } - // Divide two tensors with same shape, element-wise - inline Tensor div(const Tensor& other) const { - return arithmetic_operation_impl(ArithmeticOp::DIV, other); + // Extract matrix dimensions + const size_t n = this->shape_[A_ndim - 2]; + const size_t m = this->shape_[A_ndim - 1]; + const size_t m_other = other.shape_[B_ndim - 2]; + const size_t p = other.shape_[B_ndim - 1]; + + if (m != m_other) + { + throw std::invalid_argument("Matrix dimension mismatch: last dimension of first tensor must match second last of second tensor"); } - // Multiply all elements of tensor with the given scaler - Tensor mul(const T& scaler) const { - Tensor result(this->shapes_, static_cast(0)); + // Determine result shape: leading dimensions + [n, p] + vector result_shapes = A_leading_shape; + result_shapes.push_back(n); + result_shapes.push_back(p); - for (size_t i = 0; i < this->size(); i++) { - (*result.data_)[i] = (*this->data_)[i] * scaler; - } - return result; - } - - - /** - * Matrix multiplication of two tensors. - * - * The two tensors must have at least two dimensions. The leading dimensions (all except last two) must be equal. - * The last two dimensions must match the matrix multiplication dimensions. - * For example, if the first tensor has shape [a, b, n, m] and the second tensor has shape [a, b, m, p], the result will have shape [a, b, n, p]. - * - * The result is a tensor with the leading dimensions of the first tensor and the matrix multiplication result as the last two dimensions. - * - * The total number of batches is the product of the leading dimensions. - * - * The matrix multiplication is performed batched, i.e., for each batch, a matrix multiplication is performed. - * - * @param other The tensor to multiply with. - * @return The result of the matrix multiplication. 
- */ - Tensor matmul(const Tensor& other) const { - // Ensure both tensors have at least 2 dimensions - size_t A_ndim = this->ndim(), B_ndim = other.ndim(); - - if (A_ndim < 2 || B_ndim < 2) { - throw std::runtime_error("Tensors must have at least 2 dimensions for matrix multiplication"); - } + Tensor result(result_shapes, static_cast(0)); - // Check leading dimensions (all except last two) are equal - const size_t A_leading_ndim = A_ndim - 2; - const size_t B_leading_ndim = B_ndim - 2; + // Compute total number of batches (product of leading dimensions) + // may be we can use divisoin in stride to have O(1) time + size_t total_batches = 1; + for (const size_t &dim : A_leading_shape) + { + total_batches *= dim; + } - if (A_leading_ndim != B_leading_ndim) { - throw std::runtime_error("Number of leading dimensions must match"); + for (size_t batch = 0; batch < total_batches; ++batch) + { + // Get multi_dimensional indices for this batch + vector indices = linear_to_multi_idxs(batch, A_leading_shape); + + // Compute offsets for A, B, and result + size_t A_offset = this->offset_; + size_t B_offset = other.offset_; + size_t result_offset = 0; + + for (size_t i = 0; i < A_leading_ndim; ++i) + { + A_offset += indices[i] * this->strides_[i]; + B_offset += indices[i] * other.strides_[i]; + result_offset += indices[i] * result.strides_[i]; } - vector A_leading_shape(this->shapes_.begin(), this->shapes_.end() - 2); - vector B_leading_shape(other.shapes_.begin(), other.shapes_.end() - 2); + for (size_t i = 0; i < n; ++i) + { + for (size_t j = 0; j < p; ++j) + { + T sum = static_cast(0); - if (A_leading_shape != B_leading_shape) { - throw invalid_argument("Batch dimensions must match"); - } + for (size_t k = 0; k < m; ++k) + { + // Calculate offsets in A and B + size_t a_idx = A_offset + + i * this->strides_[A_leading_ndim] + + k * this->strides_[A_leading_ndim + 1]; - // Extract matrix dimensions - const size_t n = this->shapes_[A_ndim - 2]; - const size_t m = this->shapes_[A_ndim - 1]; - const size_t m_other = other.shapes_[B_ndim - 2]; - const size_t p = other.shapes_[B_ndim - 1]; + size_t b_idx = B_offset + + k * other.strides_[B_leading_ndim] + + j * other.strides_[B_leading_ndim + 1]; - if (m != m_other) { - throw std::invalid_argument("Matrix dimension mismatch: last dimension of first tensor must match second last of second tensor"); + sum += (*this->data_)[a_idx] * (*other.data_)[b_idx]; + } + // Write to result + size_t out_idx = result_offset + + i * result.strides_[result.ndim() - 2] + + j * result.strides_.back(); + (*result.data_)[out_idx] = sum; + } } + } - // Determine result shape: leading dimensions + [n, p] - vector result_shapes = A_leading_shape; - result_shapes.push_back(n); - result_shapes.push_back(p); + return result; + } - Tensor result(result_shapes, static_cast(0)); + /// @brief Transpose the tensor. + /// @details This function supports transposing 1D and 2D tensors. + /// 1D tensors are transposed from shape (1, n) to (n, 1). + /// For 2D tensors, it swaps rows and columns. + /// @return A new tensor that is the transpose of the original tensor. + /// @throws runtime_error if the tensor has more than 2 dimensions. 
- // Compute total number of batches (product of leading dimensions) - // may be we can use divisoin in stride to have O(1) time - size_t total_batches = 1; - for (const size_t& dim: A_leading_shape) { - total_batches *= dim; - } + Tensor transpose(int64_t dim0 = -2, int64_t dim1 = -1) const + { + const size_t ndim = this->ndim(); - for (size_t batch = 0; batch < total_batches; ++batch) { - // Get multi_dimensional indices for this batch - vector indices = linear_to_multi_idxs(batch, A_leading_shape); + if (ndim == 1 && dim0 == -2 && dim1 == -1) + { + Tensor result = *this; + return result.reshape({this->size(), 1}); + } - // Compute offsets for A, B, and result - size_t A_offset = this->offset_; - size_t B_offset = other.offset_; - size_t result_offset = 0; + if (dim0 == dim1) + { + return *this; // No-op if dimensions are the same + } - for (size_t i = 0; i < A_leading_ndim; ++i) { - A_offset += indices[i] * this->strides_[i]; - B_offset += indices[i] * other.strides_[i]; - result_offset += indices[i] * result.strides_[i]; - } + if (dim0 < 0) + { + dim0 += ndim; + } - for (size_t i = 0; i < n; ++i) { - for (size_t j = 0; j < p; ++j) { - T sum = static_cast(0); - - for (size_t k = 0; k < m; ++k) { - // Calculate offsets in A and B - size_t a_idx = A_offset + - i * this->strides_[A_leading_ndim] + - k * this->strides_[A_leading_ndim + 1]; - - size_t b_idx = B_offset + - k * other.strides_[B_leading_ndim] + - j * other.strides_[B_leading_ndim + 1]; - - sum += (*this->data_)[a_idx] * (*other.data_)[b_idx]; - } - // Write to result - size_t out_idx = result_offset + - i * result.strides_[result.ndim() - 2] + - j * result.strides_.back(); - (*result.data_)[out_idx] = sum; - } - } - } + if (dim1 < 0) + { + dim1 += ndim; + } - return result; + if (dim0 < 0 || dim0 >= ndim || dim1 < 0 || dim1 >= ndim) + { + throw out_of_range("Transpose dimensions out of range"); } - - /// @brief Transpose the tensor. - /// @details This function supports transposing 1D and 2D tensors. - /// 1D tensors are transposed from shape (1, n) to (n, 1). - /// For 2D tensors, it swaps rows and columns. - /// @return A new tensor that is the transpose of the original tensor. - /// @throws runtime_error if the tensor has more than 2 dimensions. + // Create new tensor with swapped dimensions + Tensor result = *this; + swap(result.shape_[dim0], result.shape_[dim1]); + swap(result.strides_[dim0], result.strides_[dim1]); - Tensor transpose(int64_t dim0=-2, int64_t dim1=-1) const { - const size_t ndim = this->ndim(); + // cout << "result.shapes_: " << result.shapes_[0] << " " << result.shapes_[1] << endl; + // cout << "result.strides_: " << result.strides_[0] << " " << result.strides_[1] << endl; - if (ndim == 1 && dim0 == -2 && dim1 == -1) { - Tensor result = *this; - result.reshape({this->size(), 1}); - return result; - } + return result; + } - if (dim0 == dim1) { - return *this; // No-op if dimensions are the same - } + template + Tensor permute(Dims... 
dims) const + { + vector perm_dims = {static_cast(dims)...}; - if (dim0 < 0) { - dim0 += ndim; - } + size_t ndim = this->ndim(); - if (dim1 < 0) { - dim1 += ndim; - } + if (perm_dims.size() != ndim) + { + throw std::invalid_argument("Number of dimensions in permutation must match tensor's number of dimensions"); + } - if (dim0 < 0 || dim0 >= ndim || dim1 < 0 || dim1 >= ndim) { - throw out_of_range("Transpose dimensions out of range"); + unordered_set seen_dims; + for (size_t dim : perm_dims) + { + if (dim >= ndim) + { + throw out_of_range("Permute dimension out of range"); + } + if (seen_dims.count(dim)) + { + throw invalid_argument("Duplicate dimension in permute"); } + seen_dims.insert(dim); + } - // Create new tensor with swapped dimensions - Tensor result = *this; - swap(result.shapes_[dim0], result.shapes_[dim1]); - swap(result.strides_[dim0], result.strides_[dim1]); + vector new_shapes(ndim); + vector new_strides(ndim); - cout << "result.shapes_: " << result.shapes_[0] << " " << result.shapes_[1] << endl; - cout << "result.strides_: " << result.strides_[0] << " " << result.strides_[1] << endl; + size_t i = 0; + for (size_t dim : perm_dims) + { + if (dim >= ndim) + { + throw std::out_of_range("Permutation dimension out of range"); + } - return result; + new_shapes[i] = this->shape_[dim]; + new_strides[i] = this->strides_[dim]; + ++i; } - /// @brief Flatten the tensor into 1D in-place. - /// @details This function only changes the shape of the tensor, and does not modify the underlying data. - /// @post The shape of the tensor is changed to 1D, with the same elements as the original tensor. - void flatten() { - this->shapes_ = { this->size() }; - this->calculate_strides(); - return; + Tensor result = *this; + result.shape_ = new_shapes; + result.strides_ = new_strides; + + return result; + } + + /** + * Flattens the dimensions of the tensor from start_dim to end_dim into a single dimension. + * + * This function collapses multiple dimensions of the tensor into one, effectively reducing + * the number of dimensions by merging the specified range of dimensions. If start_dim or + * end_dim is negative, it will be counted from the last dimension backwards. The resulting + * tensor will have the same total number of elements as the original tensor. + * + * @param start_dim The starting dimension index to begin flattening. Defaults to 0. + * @param end_dim The ending dimension index to stop flattening. Defaults to -1, which + * refers to the last dimension. + * @return A new tensor with the specified dimensions flattened. + * + * @throws std::invalid_argument if start_dim is greater than end_dim. + * @throws std::out_of_range if start_dim or end_dim is out of the range of the tensor's dimensions. 
+ */ + + Tensor<> flatten(int64_t start_dim = 0, int64_t end_dim = -1) const + { + if (start_dim < 0) + { + start_dim += this->ndim(); + } + if (end_dim < 0) + { + end_dim += this->ndim(); } - /// @brief Calculate the absolute value of each element in the tensor - /// @return a new tensor with the same shape as the original, but with each element replaced by its absolute value - Tensor abs() const { - Tensor result(this->shapes_, static_cast(0)); - - for (size_t i = 0; i < this->size(); i++) { - (*result.data_)[i] = std::abs((*this->data_)[i]); - } + if (start_dim > end_dim) + { + throw invalid_argument("Start dimension must be less than or equal to end dimension"); + } - return result; + if (start_dim < 0 || start_dim >= this->ndim() || end_dim < 0 || end_dim >= this->ndim()) + { + throw out_of_range("Flatten dimensions out of range"); } - /// @brief Filter the tensor with the given function - /// @param func a function to test each element of the tensor. It should return true if the element passes the test - /// @return a new tensor with the same shape as the original, but all elements that fail the test are set to 0. - Tensor filter(bool (*func)(T)) const { - Tensor result(this->shapes_, static_cast(0)); + vector new_shape; + new_shape.reserve(this->ndim() - (end_dim - start_dim + 1) + 1); - for (size_t i = 0; i < this->size(); i++) { - if (func((*this->data_)[i])) { - (*result.data_)[i] = (*this->data_)[i]; - } + for (size_t i = 0; i < this->ndim(); ++i) + { + if (i <= start_dim || i > end_dim) + { + new_shape.push_back(this->shape_[i]); + } + else + { + new_shape[new_shape.size() - 1] *= this->shape_[i]; } - - return result; } - /// @brief Perform element-wise transformation with a function - /// @param func a function to perform element-wise transformation to the tensor - /// @return a new tensor with the same shape as the original, but with each element transformed by the given func - Tensor map(T (*func)(T)) const { - Tensor result(this->shapes_, static_cast(0)); + return this->reshape(new_shape); + } - for (size_t i = 0; i < this->size(); i++) { - (*result.data_)[i] = func((*this->data_)[i]); - } + /// @brief Calculate the absolute value of each element in the tensor + /// @return a new tensor with the same shape as the original, but with each element replaced by its absolute value + Tensor abs() const + { + Tensor result = *this; - return result; + for (size_t i = 0; i < this->size(); i++) + { + (*result.data_)[i] = std::abs((*this->data_)[i]); } - /// @brief Calculate the sum of all elements in the tensor - /// @return The sum of all elements in the tensor, regardless of the dimension - T sum() const { - T sum = static_cast(0); - - for (size_t i = 0; i < this->size(); i++) { - sum += (*this->data_)[i]; + return result; + } + + /// @brief Filter the tensor with the given function + /// @param func a function to test each element of the tensor. It should return true if the element passes the test + /// @return a new tensor with the same shape as the original, but all elements that fail the test are set to 0. 
+ Tensor filter(bool (*func)(T)) const + { + Tensor result = *this; + + for (size_t i = 0; i < this->size(); i++) + { + if (!func((*this->data_)[i])) + { + (*result.data_)[i] = static_cast(0); } - - return sum; } - /// @brief Check if all elements of two tensors are equal - /// @param other Tensor to compare - /// @return Tensor of integers where each element is 1 if the two tensors are equal at the same index, 0 otherwise - Tensor equal(const Tensor& other) const{ - if (other.shapes_ != this->shapes_) { - throw runtime_error("Shape mismatch"); - } - - Tensor result(this->shapes_, static_cast(0)); - const vector& result_strides = result.strides_; + return result; + } - for (size_t i = 0; i < this->size(); i++) { - auto [a_offset, b_offset] = calculate_tensors_offsets(i, this->ndim(), result_strides, other); + /// @brief Perform element-wise transformation with a function + /// @param func a function to perform element-wise transformation to the tensor + /// @return a new tensor with the same shape as the original, but with each element transformed by the given func + Tensor map(T (*func)(T)) const + { + Tensor result = *this; - (*result.data_)[i] = (*this->data_)[a_offset] == (*other.data_)[b_offset]; - } - - return result.dtype(); + for (size_t i = 0; i < this->size(); i++) + { + (*result.data_)[i] = func((*this->data_)[i]); } - /// @brief Check if all elements of two tensors are equal - /// @param other Tensor to compare - /// @return true if all elements are equal, false otherwise - bool compare(const Tensor& other) const { - if (other.shapes_ != this->shapes_) { - throw runtime_error("Shape mismatch"); - } + return result; + } - for (size_t i = 0; i < this->size(); i++) { - auto [a_offset, b_offset] = calculate_tensors_offsets(i, this->ndim(), this->strides_, other); + /// @brief Calculate the sum of all elements in the tensor + /// @return The sum of all elements in the tensor, regardless of the dimension + T sum() const + { + T sum = static_cast(0); - if ((*this->data_)[a_offset] != (*other.data_)[b_offset]) { - return false; - } - } - return true; + for (size_t i = 0; i < this->size(); i++) + { + sum += (*this->data_)[i]; } - /// @brief Reduce the tensor to the maximum value of all elements - /// @return a tensor with a single element, the maximum of all elements in the tensor - inline Tensor<> max() const { - return reduce_impl(ReduceOp::MAX); + return sum; + } + + /// @brief Check if all elements of two tensors are equal + /// @param other Tensor to compare + /// @return Tensor of integers where each element is 1 if the two tensors are equal at the same index, 0 otherwise + Tensor equal(const Tensor &other) const + { + if (other.shape_ != this->shape_) + { + throw runtime_error("Shape mismatch"); } + Tensor result(this->shape_, static_cast(0)); + const vector &result_strides = result.strides_; + + for (size_t i = 0; i < this->size(); i++) + { + auto [a_offset, b_offset] = calculate_tensors_offsets(i, this->ndim(), result_strides, other); - /// @brief Reduce the tensor to the indices of the maximum values along each row - /// @return a tensor with indices of the maximum values for each row - inline Tensor argmax() const { - return reduce_impl(ReduceOp::ARGMAX); + (*result.data_)[i] = (*this->data_)[a_offset] == (*other.data_)[b_offset]; } - /// @brief Reduce the tensor to the minimum value of all elements - /// @return a tensor with a single element, the minimum of all elements in the tensor - inline Tensor<> min() const { - return reduce_impl(ReduceOp::MIN); + return 
result.dtype(); + } + + /// @brief Check if all elements of two tensors are equal + /// @param other Tensor to compare + /// @return true if all elements are equal, false otherwise + bool compare(const Tensor &other) const + { + if (other.shape_ != this->shape_) + { + throw runtime_error("Shape mismatch"); } - /// @brief Reduce the tensor to the indices of the minimum values along each row - /// @return a tensor with indices of the minimum values for each row - inline Tensor argmin() const { - return reduce_impl(ReduceOp::ARGMIN); + for (size_t i = 0; i < this->size(); i++) + { + auto [a_offset, b_offset] = calculate_tensors_offsets(i, this->ndim(), this->strides_, other); + + if ((*this->data_)[a_offset] != (*other.data_)[b_offset]) + { + return false; + } } + return true; + } + + /// @brief Reduce the tensor to the maximum value of all elements + /// @return a tensor with a single element, the maximum of all elements in the tensor + inline Tensor<> max() const + { + return reduce_impl(ReduceOp::MAX); + } + + /// @brief Reduce the tensor to the indices of the maximum values along each row + /// @return a tensor with indices of the maximum values for each row + inline Tensor argmax() const + { + return reduce_impl(ReduceOp::ARGMAX); + } + + /// @brief Reduce the tensor to the minimum value of all elements + /// @return a tensor with a single element, the minimum of all elements in the tensor + inline Tensor<> min() const + { + return reduce_impl(ReduceOp::MIN); + } + + /// @brief Reduce the tensor to the indices of the minimum values along each row + /// @return a tensor with indices of the minimum values for each row + inline Tensor argmin() const + { + return reduce_impl(ReduceOp::ARGMIN); + } + + /// @brief Convert the tensor to a tensor of a different type. + /// @details If U is not provided, it defaults to double. + /// @param U the type to convert to + /// @return a tensor with the same shape and data, but with the type U + template + Tensor dtype() const + { + return dtype_impl(*this); + } + + /// @brief Reshape the tensor to the specified new shape. + /// @details This function changes the shape of the tensor without altering the data. + /// The total number of elements must remain the same; otherwise, an exception is thrown. + /// @param new_shape The desired shape for the tensor. + /// @throws runtime_error if the new shape is not compatible with the current number of elements. + Tensor<> reshape(const vector &new_shape) const + { + // Calculate total elements for both shapes + const int64_t current_elements = accumulate( + this->shape_.begin(), this->shape_.end(), 1, multiplies()); + const int64_t new_elements = accumulate( + new_shape.begin(), new_shape.end(), 1, multiplies()); + + if (current_elements != new_elements) + { + throw runtime_error("New shape must be compatible with the original shape"); + } + + // Check if the data is stored in a contiguous way + vector original_strides(this->ndim(), 0); + int64_t stride = 1; - - /// @brief Convert the tensor to a tensor of a different type. - /// @details If U is not provided, it defaults to double. - /// @param U the type to convert to - /// @return a tensor with the same shape and data, but with the type U - template - Tensor dtype() const { - return dtype_impl(*this); + for (int64_t i = this->ndim() - 1; i >= 0; --i) + { + original_strides[i] = stride; + stride *= this->shape_[i]; } - /// @brief Reshape the tensor to the specified new shape. 
- /// @details This function changes the shape of the tensor without altering the data. - /// The total number of elements must remain the same; otherwise, an exception is thrown. - /// @param new_shape The desired shape for the tensor. - /// @throws runtime_error if the new shape is not compatible with the current number of elements. - void reshape(const vector& new_shape) { - size_t new_size = 1; - for (const size_t& dim : new_shape) { - new_size *= dim; - } + Tensor result; - if (new_size != this->size()) { - throw runtime_error("New shape must be compatible with the original shape"); - } + // If the data is not stored in a contiguous way, the strides will not be a cumulative product of the shape + if (original_strides != this->strides_) + { + cout << "Clone the tensor" << endl; + /* + This part is a little bit complicated. - this->shapes_ = new_shape; - this->calculate_strides(); + Since the data may not be stored in a contiguous way, there is a problem when we directly change the shape of the tensor. + We would lose track of the strides of the tensor. - return; - } + Therefore we have to make the data contiguous first; then we can change the shape of the tensor. + The clone() function will create a new tensor with the same shape and data as the current tensor. - /// @brief Return a deep copy of the tensor. Actually the same as the copy constructor. - /// @details This function will create a new tensor with the same shape and data as the current tensor. - /// @return a new tensor which is a deep copy of the current tensor - Tensor clone() const { - Tensor result = *this; + If we directly used the copy constructor, the data would not be stored in a contiguous way, + since I don't rearrange the data in the copy constructor. - return result; - } + Eventually the tensor data is guaranteed to be stored in a contiguous way, so we can directly change the shape of the tensor. + */ - vector to_vector() const { return (*this->data_); } - - // Get the dimension of the tensor - inline size_t ndim() const { - return this->shapes_.size(); + // Create a new tensor with contiguous data + result = this->clone(); + } + else + { + // the data is already stored in a contiguous way + result = *this; } - const size_t size() const { - size_t n = 1; - for (const size_t& s : this->shapes_) { - n *= s; + result.shape_ = new_shape; + result.compute_contiguous_strides(); + + return result; + } + + /// @brief Return a deep copy of the tensor. The data is copied to a new contiguous storage (and this is the only difference from the copy constructor). + /// @details This function will create a new tensor with the same shape and data as the current tensor. + /// @return a new tensor which is a deep copy of the current tensor + Tensor clone() const + { + Tensor result; + + result.shape_ = this->shape_; + result.data_ = make_shared>(*(this->data_)); + result.compute_contiguous_strides(); + + // Copy data from original tensor's view to the new contiguous storage + for (size_t i = 0; i < this->size(); ++i) + { + vector indices = linear_to_multi_idxs(i, result.shape_); + size_t src_offset = this->offset_; + + for (size_t dim = 0; dim < indices.size(); ++dim) + { + src_offset += indices[dim] * this->strides_[dim]; } - /// @brief Print the tensor to console. - /// @details This function will print the tensor in a nested array style.
- void print() const { - print_recursive_impl(0, 0, 0); - cout << endl; // flush the output - return; + (*result.data_)[i] = (*this->data_)[src_offset]; } - inline const vector& shapes() const { return this->shapes_; } + return result; + } + static Tensor arange(size_t start, size_t end = 0, vector shape = {0}) + { + if (start == end) // if only one argument is provided + { + throw runtime_error("arange() missing required argument: 'end'"); + } + if (end == 0) + { + end = start; + start = 0; + } - // ========================================operators overloading======================================== - inline Tensor operator+(const Tensor& other) const { return this->add(other); } - inline Tensor operator-(const Tensor& other) const { return this->sub(other); } - inline Tensor operator*(const Tensor& other) const { return this->mul(other); } - inline Tensor operator*(const T& scaler) const { return this->mul(scaler); } - inline bool operator==(const Tensor& other) const { return this->compare(other); } + if (shape.size() == 1 && shape[0] <= 0) + { + shape[0] = end - start + 1; + } - Tensor& operator=(const Tensor& other) { - if (this == &other) return *this; + Tensor result(shape, static_cast(0)); - this->shapes_ = other.shapes_; - this->data_ = make_shared>(*(other.data_)); - this->calculate_strides(); + cout << "In arange, weight address: " << &result.data_ << endl; - // Copy data from original tensor's view to the new contiguous storage - for (size_t i = 0; i < this->size(); ++i) { - vector indices = linear_to_multi_idxs(i, this->shapes_); - size_t src_offset = other.offset_; + size_t idx = 0; + for (size_t i = start; i <= end; i++) + { + (*result.data_)[idx] = static_cast(i); + idx++; + } - for (size_t dim = 0; dim < indices.size(); ++dim) { - src_offset += indices[dim] * other.strides_[dim]; - } + return result; + } - (*this->data_)[i] = (*other.data_)[src_offset]; - } + // Get the dimension of the tensor + inline size_t ndim() const + { + return this->shape_.size(); + } - return *this; + const size_t size() const + { + if (this->offset_ == 0) + { + return this->data_->size(); } - const Tensor operator+=(const Tensor& other) { - *this = *this + other; - return *this; + if (this->size_ != -1) + { + return this->size_; } - const Tensor operator-=(const Tensor& other) { - *this = *this - other; - return *this; + this->size_ = 1; + for (const size_t &s : this->shape_) + { + this->size_ *= s; } - const Tensor operator*=(const Tensor& other) { - *this = *this * other; + return this->size_; + } + + /// @brief Print the tensor to console. + /// @details This function will print the tensor in a nested array style. 
+ void print() const + { + print_recursive_impl(0, 0, 0); + cout << endl; // flush the output + return; + } + + inline const vector &shapes() const { return this->shape_; } + + // ========================================operators overloading======================================== + inline Tensor operator+(const Tensor &other) const { return this->add(other); } + inline Tensor operator-(const Tensor &other) const { return this->sub(other); } + inline Tensor operator*(const Tensor &other) const { return this->mul(other); } + inline Tensor operator*(const T &scaler) const { return this->mul(scaler); } + inline Tensor operator/(const Tensor &other) const { return this->div(other); } + inline Tensor operator/(const T &scaler) const { return this->div(scaler); } + inline bool operator==(const Tensor &other) const { return this->compare(other); } + + /* + Instead of returning a new tensor, we modify the current tensor in place. + + Besides, it is slightly different from method clone(), in which it will not modify data_ to make all the elements stored contiguously. + */ + Tensor &operator=(const Tensor &other) + { + if (this == &other) return *this; - } - const Tensor operator*=(const T& other) { - *this = *this * other; - return *this; + this->shape_ = other.shape_; + this->data_ = make_shared>(*(other.data_)); + this->strides_ = other.strides_; + this->offset_ = other.offset_; + this->size_ = other.size_; + + return *this; + } + + const Tensor operator+=(const Tensor &other) + { + *this = *this + other; + return *this; + } + + const Tensor operator-=(const Tensor &other) + { + *this = *this - other; + return *this; + } + + const Tensor operator*=(const Tensor &other) + { + *this = *this * other; + return *this; + } + + const Tensor operator*=(const T &other) + { + *this = *this * other; + return *this; + } + + const Tensor operator/=(const Tensor &other) + { + *this = *this / other; + return *this; + } + + const Tensor operator/=(const T &other) + { + *this = *this / other; + return *this; + } + + // lvalue operator overloading + template + T &operator[](Indices... indices) + { + vector idxs = this->get_idxs(indices...); + return (*this->data_)[this->calculate_idx(idxs)]; + } + + // Using vector to index the tensor + T &operator[](const vector &indices) + { + return (*this->data_)[this->calculate_idx(indices)]; + } + + // rvalue operator overloading + template + const T &operator[](Indices... indices) const + { + vector idxs = this->get_idxs(indices...); + return (*this->data_)[this->calculate_idx(idxs)]; + } + + // Using vector to index the tensor + const T &operator[](const vector &indices) const + { + return (*this->data_)[this->calculate_idx(indices)]; + } + + /** + * @brief Advanced indexing using a combination of integers, strings, and slices. + * + * This function allows for flexible indexing into the tensor, similar to Python's + * advanced indexing. It supports integer indices, string-based slices, and the ellipsis + * ("...") for automatic dimension completion. The function expands slices and handles + * ellipsis to generate the appropriate sub-tensor. + * + * @param indices A vector of indices where each index can be an integer, a string + * representing a slice, or a special ellipsis ("..."). + * @return A new tensor that is indexed from the current tensor according to the given indices. + * + * @throw std::invalid_argument if an index type is invalid or if more than one ellipsis is used. 
+ */ + using IndexType = variant; + Tensor index(const vector &indices) const + { + vector> expanded_indices; + + // Handle ellipsis and expand slices + // cout << "Start expanding indices" << endl; + for (size_t i = 0; i < indices.size(); ++i) + { + const auto &idx = indices[i]; + + if (auto str_idx = get_if(&idx)) + { + Slice slice = Slice::parse(*str_idx); + expanded_indices.push_back(apply_slice(slice, this->shape_[i])); + } + else if (auto int_idx = get_if(&idx)) + { + expanded_indices.push_back({normalize_index(*int_idx, this->shape_[i])}); + } + else if (auto slice_idx = get_if(&idx)) + { + expanded_indices.push_back(apply_slice(*slice_idx, this->shape_[i])); + } + else + { + throw std::invalid_argument("Invalid index type"); + } } - // lvalue operator overloading - template - T& operator[](Indices... indices) { - vector idxs = this->get_idxs(indices...); - return (*this->data_)[this->calculate_idx(idxs)]; - } - - T& operator[](const vector& indices) { - return (*this->data_)[this->calculate_idx(indices)]; - } - - // rvalue operator overloading - template - const T& operator[](Indices... indices) const { - vector idxs = this->get_idxs(indices...); - return (*this->data_)[this->calculate_idx(idxs)]; - } - - const T& operator[](const vector& indices) const { - return (*this->data_)[this->calculate_idx(indices)]; - } - - /** - * @brief Advanced indexing using a combination of integers, strings, and slices. - * - * This function allows for flexible indexing into the tensor, similar to Python's - * advanced indexing. It supports integer indices, string-based slices, and the ellipsis - * ("...") for automatic dimension completion. The function expands slices and handles - * ellipsis to generate the appropriate sub-tensor. - * - * @param indices A vector of indices where each index can be an integer, a string - * representing a slice, or a special ellipsis ("..."). - * @return A new tensor that is indexed from the current tensor according to the given indices. - * - * @throw std::invalid_argument if an index type is invalid or if more than one ellipsis is used. 
- */ - using IndexType = variant; - Tensor index(const vector& indices) const { - vector> expanded_indices; - - // Handle ellipsis and expand slices - // cout << "Start expanding indices" << endl; - for (size_t i = 0; i < indices.size(); ++i) { - const auto& idx = indices[i]; - - if (auto str_idx = get_if(&idx)) { - Slice slice = Slice::parse(*str_idx); - expanded_indices.push_back(apply_slice(slice, this->shapes_[i])); - } - else if (auto int_idx = get_if(&idx)) { - expanded_indices.push_back({normalize_index(*int_idx, this->shapes_[i])}); - } - else if (auto slice_idx = get_if(&idx)) { - expanded_indices.push_back(apply_slice(*slice_idx, this->shapes_[i])); - } - else { - throw std::invalid_argument("Invalid index type"); + // Calculate new dimensions + vector new_dims; + for (const vector &expanded_idx : expanded_indices) + { + if (expanded_idx[0] != -1) + { // Not None/newaxis + if (expanded_idx.size() > 1) + { + new_dims.push_back(expanded_idx.size()); } } - - // Calculate new dimensions - vector new_dims; - for (const vector& expanded_idx : expanded_indices) { - if (expanded_idx[0] != -1) { // Not None/newaxis - if (expanded_idx.size() > 1) { - new_dims.push_back(expanded_idx.size()); - } - } - else { - new_dims.push_back(1); - } + else + { + new_dims.push_back(1); } + } - // cout << "Start printing new_dims" << endl; - // cout << "new_dims size: " << new_dims.size() << endl; - // for (size_t i = 0; i < new_dims.size(); ++i) { - // cout << new_dims[i] << " "; - // } - - // Create result tensor - Tensor result(new_dims, static_cast(0)); - - // Fill result tensor - vector current_indices(expanded_indices.size()); - vector result_indices; - - // Recursive lambda to fill result tensor - function fill_tensor = [&](size_t depth) { - if (depth == expanded_indices.size()) { - result_indices.clear(); - for (int i = 0; i < expanded_indices.size(); ++i) { - if (expanded_indices[i][0] != -1 && expanded_indices[i].size() > 1) { - result_indices.push_back(current_indices[i]); - } - } - - vector original_indices; - for (int i = 0; i < expanded_indices.size(); ++i) { - if (expanded_indices[i][0] != -1) { - original_indices.push_back(expanded_indices[i][current_indices[i]]); - } + // cout << "Start printing new_dims" << endl; + // cout << "new_dims size: " << new_dims.size() << endl; + // for (size_t i = 0; i < new_dims.size(); ++i) { + // cout << new_dims[i] << " "; + // } + + // Create result tensor + Tensor result(new_dims, static_cast(0)); + + // Fill result tensor + vector current_indices(expanded_indices.size()); + vector result_indices; + + // Recursive lambda to fill result tensor + function fill_tensor = [&](size_t depth) + { + if (depth == expanded_indices.size()) + { + result_indices.clear(); + for (int i = 0; i < expanded_indices.size(); ++i) + { + if (expanded_indices[i][0] != -1 && expanded_indices[i].size() > 1) + { + result_indices.push_back(current_indices[i]); } - - result[result_indices] = (*this)[original_indices]; - return; } - - for (int i = 0; i < expanded_indices[depth].size(); ++i) { - current_indices[depth] = i; - fill_tensor(depth + 1); + + vector original_indices; + for (int i = 0; i < expanded_indices.size(); ++i) + { + if (expanded_indices[i][0] != -1) + { + original_indices.push_back(expanded_indices[i][current_indices[i]]); + } } - }; - - fill_tensor(0); - return result; - } + + result[result_indices] = (*this)[original_indices]; + return; + } + + for (int i = 0; i < expanded_indices[depth].size(); ++i) + { + current_indices[depth] = i; + fill_tensor(depth + 1); + } + 
}; + + fill_tensor(0); + return result; + } }; \ No newline at end of file diff --git a/include/modules/activations/softmax.hpp b/include/modules/activations/softmax.hpp index 96542cc..9447211 100644 --- a/include/modules/activations/softmax.hpp +++ b/include/modules/activations/softmax.hpp @@ -10,6 +10,7 @@ class Softmax : public Module { Tensor<> softmax_helper(const Tensor<>& input); vector softmax_helper(const vector& input); public: + Softmax(); Tensor<> forward(const Tensor<>& input); Tensor<> backward(const Tensor<>& grad_output); const Tensor<>& get_softmax_input_cache() const { return this->softmax_input_cache_; } diff --git a/include/modules/layers/conv2d.hpp b/include/modules/layers/conv2d.hpp index 2d68d66..1868be9 100644 --- a/include/modules/layers/conv2d.hpp +++ b/include/modules/layers/conv2d.hpp @@ -1,30 +1,50 @@ +#pragma once +#include #include "module.hpp" +#include "conv2d_utils.hpp" using namespace nn; -namespace nn { +namespace nn +{ -class Conv2d : public Module { + class Conv2d : public Module + { public: - Conv2d(int in_channels, int out_channels, int kernel_size, int stride = 1, int padding = 0, int dilation = 1, bool bias = true); - virtual Tensor<> forward(const Tensor<>& input) override; - virtual Tensor<> backward(const Tensor<>& grad_output) override; + Conv2d(size_t in_channels, + size_t out_channels, + var_pair kernel_size, + var_pair stride = (size_t)1, + var_pair padding = (size_t)0, + var_pair dilation = (size_t)1, + const string &padding_mode = "zeros", + bool bias = true); + + virtual Tensor<> forward(const Tensor<> &input) override; + virtual Tensor<> backward(const Tensor<> &grad_output) override; virtual void update_params(const float lr) override; - Tensor<> convolution(const Tensor<>& input, const Tensor<> filter); - Tensor<> full_convolution(const Tensor<>& input, const Tensor<> filter); - - private: - int in_channels_; - int out_channels_; - int kernel_size_; - int stride_; - int padding_; - int dilation_; - bool bias_; - Tensor<> weights_; - Tensor<> biases_; - Tensor<> grad_weights_; - Tensor<> grad_biases_; -}; + void reset_parameters(); + + void set_weight(const Tensor<> &target_weight) { this->weight_ = target_weight; } + void set_bias(const Tensor<> &target_bias) { this->bias_ = target_bias; } + const Tensor<> &get_weight() const { return this->weight_; } + const Tensor<> &get_bias() const { return this->bias_; } + + private: + size_t in_channels_; + size_t out_channels_; + size_tp2 kernel_size_; + size_tp2 stride_; + size_tp2 padding_; + size_tp2 dilation_; + bool use_bias_; + PaddingMode padding_mode_; + Padding padding_module_; + vector original_input_shape_; + Tensor<> weight_; + Tensor<> bias_; + Tensor<> grad_weight_; + Tensor<> grad_bias_; + }; } \ No newline at end of file diff --git a/include/modules/layers/flatten.hpp b/include/modules/layers/flatten.hpp new file mode 100644 index 0000000..3a06f7a --- /dev/null +++ b/include/modules/layers/flatten.hpp @@ -0,0 +1,22 @@ +#pragma once +#include "module.hpp" + +namespace nn +{ + + class Flatten : public Module + { + public: + Flatten(int64_t start_dim = 1, int64_t end_dim = -1); + + virtual Tensor<> forward(const Tensor<> &input) override; + virtual Tensor<> backward(const Tensor<> &grad_output) override; + virtual void update_params(const float lr) override; + + private: + int64_t start_dim_; + int64_t end_dim_; + vector original_input_shape_; + }; + +} \ No newline at end of file diff --git a/include/modules/layers/linear.hpp b/include/modules/layers/linear.hpp index 
5265488..036ed50 100644 --- a/include/modules/layers/linear.hpp +++ b/include/modules/layers/linear.hpp @@ -1,34 +1,36 @@ #pragma once #include "module.hpp" -namespace nn { +namespace nn +{ -class Linear : public Module{ + class Linear : public Module + { public: Linear(size_t in_features, size_t out_features, bool bias); - - virtual Tensor<> forward(const Tensor<>& input) override; - virtual Tensor<> backward(const Tensor<>& grad_output) override; + + virtual Tensor<> forward(const Tensor<> &input) override; + virtual Tensor<> backward(const Tensor<> &grad_output) override; virtual void update_params(const float lr) override; - void randomizeParams(); + void reset_parameters(); // setters - inline void set_weights(const Tensor<>& desiredWeights) { this->weights_ = desiredWeights; }; - inline void set_biases(const Tensor<>& desiredBiases) { this->biases_ = desiredBiases; } + inline void set_weight(const Tensor<> &target_weight) { this->weight_ = target_weight; }; + inline void set_bias(const Tensor<> &target_bias) { this->bias_ = target_bias; } // getters - inline const Tensor<>& getWeights() const { return this->weights_; } - inline const Tensor<>& getBiases() const { return this->biases_; } + inline const Tensor<> &get_weight() const { return this->weight_; } + inline const Tensor<> &get_bias() const { return this->bias_; } private: size_t in_features_; size_t out_features_; - bool bias_; - Tensor<> weights_; - Tensor<> biases_; - Tensor<> grad_weights_; - Tensor<> grad_biases_; - }; + bool use_bias_; + Tensor<> weight_; + Tensor<> bias_; + Tensor<> grad_weight_; + Tensor<> grad_bias_; + }; } \ No newline at end of file diff --git a/include/utils/conv2d_utils.hpp b/include/utils/conv2d_utils.hpp index 40e3f05..a53b9bd 100644 --- a/include/utils/conv2d_utils.hpp +++ b/include/utils/conv2d_utils.hpp @@ -1,4 +1,34 @@ #include "tensor.hpp" using namespace std; -Tensor<> rotate_kernel(const Tensor<>& kernel); +using size_tp2 = std::pair; +using var_pair = std::variant; + +enum class PaddingMode +{ + ZEROS, + REFLECT, + REPLICATE +}; + +class Padding +{ +public: + Padding() = default; + Padding(size_tp2 padding, PaddingMode padding_mode) : padding_(padding), padding_mode_(padding_mode) {} + Tensor<> pad(const Tensor<> &input, const size_tp2 &padding) const; + Tensor<> zero_pad(const Tensor<> &input, const size_tp2 &padding) const; + +private: + size_tp2 padding_; + PaddingMode padding_mode_; +}; + +Tensor<> +convolution(const size_tp2 &stride, const size_tp2 &dilation, const vector &output_shape, const Tensor<> &input, const Tensor<> &kernel, const Tensor<> &bias, bool use_bias); + +const vector calculate_output_shape(const vector &input_shape, const int64_t out_channel, const size_tp2 &kernel_size, const size_tp2 &stride, const size_tp2 &padding, const size_tp2 &dilation); + +Tensor<> flip_vertical_and_horizontal(const Tensor<> &input); + +Tensor<> dilate_input(const Tensor<> &input, const size_tp2 &dilation); \ No newline at end of file diff --git a/include/utils/tensor_utils.hpp b/include/utils/tensor_utils.hpp index efa2b35..94681e1 100644 --- a/include/utils/tensor_utils.hpp +++ b/include/utils/tensor_utils.hpp @@ -1,6 +1,8 @@ #pragma once #include +#include #include +#include #include #include #include @@ -10,7 +12,8 @@ #include #include #include - +#include +#include using namespace std; @@ -20,11 +23,12 @@ template class Tensor; // Convert tensor to different data type -template -Tensor dtype_impl(const Tensor& tensor); +template +Tensor dtype_impl(const Tensor &tensor); // for max, 
min ,argmax, argmin reduction -enum class ReduceOp { +enum class ReduceOp +{ MAX, MIN, ARGMAX, @@ -32,7 +36,8 @@ enum class ReduceOp { }; // for add, subtract, multiply, divide -enum class ArithmeticOp { +enum class ArithmeticOp +{ ADD, SUB, MUL, @@ -40,50 +45,65 @@ enum class ArithmeticOp { }; // Slice struct to handle Python-like slicing -struct Slice { +struct Slice +{ int start; int stop; int step; - + Slice(int start_ = 0, int stop_ = -1, int step_ = 1) : start(start_), stop(stop_), step(step_) {} - - static Slice parse(const string& slice_str); + + static Slice parse(const string &slice_str); }; // Helper function to convert negative indices to positive size_t normalize_index(int idx, size_t dim_size); // Helper function to apply slice to a dimension -vector apply_slice(const Slice& slice, size_t dim_size); +vector apply_slice(const Slice &slice, size_t dim_size); -vector linear_to_multi_idxs(size_t idx, const vector& shape); +// Helper function to calculate the offset of the tensor given a single index +vector linear_to_multi_idxs(size_t idx, const vector &shape); // Type trait to check if a type is a std::vector -template -struct is_vector : public std::false_type {}; +template +struct is_vector : public std::false_type +{ +}; -template -struct is_vector> : public std::true_type {}; +template +struct is_vector> : public std::true_type +{ +}; // Type trait to check if a type is a std::vector -template -struct is_initializer_list : public std::false_type {}; +template +struct is_initializer_list : public std::false_type +{ +}; -template -struct is_initializer_list> : public std::true_type {}; +template +struct is_initializer_list> : public std::true_type +{ +}; // ================================================definition================================================ -template -Tensor dtype_impl(const Tensor& tensor) { +template +Tensor dtype_impl(const Tensor &tensor) +{ Tensor result; - result.shapes_ = tensor.shapes_; + + result.shape_ = tensor.shape_; + result.data_ = make_shared>(); result.data_->resize(tensor.data_->size()); - + result.strides_ = tensor.strides_; + result.offset_ = tensor.offset_; + result.size_ = tensor.size_; + std::transform(tensor.data_->begin(), tensor.data_->end(), result.data_->begin(), - [](const U& val) { return static_cast(val); }); - - result.calculate_strides(); - + [](const U &val) + { return static_cast(val); }); + return result; } \ No newline at end of file diff --git a/include/utils/utils.hpp b/include/utils/utils.hpp index 644fb92..efdd871 100644 --- a/include/utils/utils.hpp +++ b/include/utils/utils.hpp @@ -1,6 +1,7 @@ +#pragma once #include #include using namespace std; -void print_training_stats(int batch, float loss, float accuracy); -void print_training_stats_line(int batch, float loss, float accuracy); \ No newline at end of file +void print_stats(int batch, float loss, float accuracy); +void print_stats_line(int batch, float loss, float accuracy); \ No newline at end of file diff --git a/src/datasets/mnist.cpp b/src/datasets/mnist.cpp index add5f07..bb8b178 100644 --- a/src/datasets/mnist.cpp +++ b/src/datasets/mnist.cpp @@ -114,9 +114,7 @@ bool MNIST::read_labels(const string& path) { tuple, Tensor<>> Batch::to_tensor() { Tensor<> data = this->batch_data; + Tensor<> labels = this->batch_labels; - Tensor labels_int = this->batch_labels; - Tensor<> labels = labels_int.dtype(); - return make_tuple(data, labels); } \ No newline at end of file diff --git a/src/models/mlp.cpp b/src/models/mlp.cpp index 8a5fda4..f1d0700 100644 --- 
a/src/models/mlp.cpp +++ b/src/models/mlp.cpp @@ -3,12 +3,15 @@ #include "relu.hpp" #include "dropout.hpp" -MLP::MLP(vector layer_sizes, double dropout_p) { +MLP::MLP(vector layer_sizes, double dropout_p) +{ this->num_layers_ = layer_sizes.size(); - for (size_t i = 0; i < this->num_layers_ - 1; i++) { + for (size_t i = 0; i < this->num_layers_ - 1; i++) + { this->layers_.push_back(new Linear(layer_sizes[i], layer_sizes[i + 1], true)); - if (i < this->num_layers_ - 2) { + if (i < this->num_layers_ - 2) + { this->layers_.push_back(new ReLU()); this->layers_.push_back(new Dropout(dropout_p)); } @@ -17,34 +20,42 @@ MLP::MLP(vector layer_sizes, double dropout_p) { MLP::MLP(initializer_list layer_sizes, double dropout_p) : MLP(vector(layer_sizes), dropout_p) {} -MLP::~MLP() { - for (Module* layer : this->layers_) { +MLP::~MLP() +{ + for (Module *layer : this->layers_) + { delete layer; } } -Tensor<> MLP::forward(const Tensor<>& input) { +Tensor<> MLP::forward(const Tensor<> &input) +{ Tensor<> x = input; - for (Module* layer : this->layers_) { + for (Module *layer : this->layers_) + { x = layer->forward(x); } return x; } -Tensor<> MLP::backward(const Tensor<>& grad_output) { +Tensor<> MLP::backward(const Tensor<> &grad_output) +{ Tensor<> grad = grad_output; - for (int i = this->layers_.size() - 1; i >= 0; i--) { + for (int i = this->layers_.size() - 1; i >= 0; i--) + { grad = this->layers_[i]->backward(grad); } return grad; } -void MLP::update_params(const float lr) { - for (Module* layer : this->layers_) { +void MLP::update_params(const float lr) +{ + for (Module *layer : this->layers_) + { layer->update_params(lr); } diff --git a/src/modules/activations/softmax.cpp b/src/modules/activations/softmax.cpp index 2e2be8d..6fe51c2 100644 --- a/src/modules/activations/softmax.cpp +++ b/src/modules/activations/softmax.cpp @@ -2,21 +2,32 @@ #include "softmax.hpp" using namespace nn; -Tensor<> Softmax::softmax_helper(const Tensor<>& input) { - Tensor<> result = input.map([](double x) { return exp(x); }); +Softmax::Softmax() +{ + cout << "Starting Softmax" << endl; + cout << "Softmax initialized" << endl; +} + +Tensor<> Softmax::softmax_helper(const Tensor<> &input) +{ + Tensor<> result = input.map([](double x) + { return exp(x); }); double sum = result.sum(); return result * (1 / sum); } -vector Softmax::softmax_helper(const vector& input) { +vector Softmax::softmax_helper(const vector &input) +{ double sum = 0.0f; vector result; - for (size_t i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) + { sum += exp(input[i]); } - for (size_t i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) + { result.push_back(exp(input[i]) / sum); } @@ -24,21 +35,32 @@ vector Softmax::softmax_helper(const vector& input) { } // Only support 1D and 2D Tensors -Tensor<> Softmax::forward(const Tensor<>& input) { +Tensor<> Softmax::forward(const Tensor<> &input) +{ // In softmax case, we don't have to store the input as it is not used in the backward pass // Instead, we store the softmax(input) - if (input.ndim() == 1) { + if (input.ndim() == 1) + { return this->softmax_helper(input); } + // const size_t leading_ndim = input.ndim() - 2; + + // vector leading_shape(input.shapes().begin(), input.shapes().end() - 2); + + // const size_t n = input.shapes()[leading_ndim]; + // const size_t m = input.shapes()[leading_ndim + 1]; + vector> softmax_input; - for (size_t i = 0; i < input.shapes()[0]; i++) { + for (size_t i = 0; i < input.shapes()[0]; i++) + { vector input_row; 
input_row.reserve(input.shapes()[1]); - for (size_t j = 0; j < input.shapes()[1]; j++) { + for (size_t j = 0; j < input.shapes()[1]; j++) + { input_row.push_back(input[i, j]); } softmax_input.push_back(this->softmax_helper(input_row)); @@ -49,7 +71,8 @@ Tensor<> Softmax::forward(const Tensor<>& input) { return this->softmax_input_cache_; } -Tensor<> Softmax::backward(const Tensor<>& grad_output) { +Tensor<> Softmax::backward(const Tensor<> &grad_output) +{ Tensor<> softmax_grad; return softmax_grad; diff --git a/src/modules/layers/conv2d.cpp b/src/modules/layers/conv2d.cpp index d1d44d0..5e7a87d 100644 --- a/src/modules/layers/conv2d.cpp +++ b/src/modules/layers/conv2d.cpp @@ -1,12 +1,235 @@ +#include +#include #include "conv2d.hpp" using namespace nn; -Conv2d::Conv2d(int in_channels, int out_channels, int kernel_size, int stride, int padding, int dilation, bool bias) { +Conv2d::Conv2d(size_t in_channels, + size_t out_channels, + var_pair kernel_size, + var_pair stride, + var_pair padding, + var_pair dilation, + const string &padding_mode, + bool bias) +{ this->in_channels_ = in_channels; this->out_channels_ = out_channels; - this->kernel_size_ = kernel_size; - this->stride_ = stride; - this->padding_ = padding; - this->dilation_ = dilation; - this->bias_ = bias; + this->use_bias_ = bias; + + // Helper lambda to process variant parameters + auto process_variant = [](auto &&arg) -> size_tp2 + { + using T = std::decay_t; + if constexpr (std::is_same_v) + { + if (arg < 0) + { + throw std::invalid_argument("Negative kernel size, stride, padding, or dilation is not supported"); + } + return {arg, arg}; + } + else + { + static_assert(std::is_same_v, "Unexpected type in variant"); + return arg; + } + }; + + // Set kernel size, stride, padding, and dilation + this->kernel_size_ = std::visit(process_variant, kernel_size); + this->stride_ = std::visit(process_variant, stride); + this->padding_ = std::visit(process_variant, padding); + this->dilation_ = std::visit(process_variant, dilation); + + // cout << "Kernel Size : " << this->kernel_size_.first << ", " << this->kernel_size_.second << endl; + // cout << "Stride : " << this->stride_.first << ", " << this->stride_.second << endl; + // cout << "Padding : " << this->padding_.first << ", " << this->padding_.second << endl; + // cout << "Dilation : " << this->dilation_.first << ", " << this->dilation_.second << endl; + + // Check if padding mode is valid + unordered_map all_padding_modes = {{"zeros", PaddingMode::ZEROS}, {"reflect", PaddingMode::REFLECT}, {"replicate", PaddingMode::REPLICATE}}; + + if (all_padding_modes.find(padding_mode) == all_padding_modes.end()) + { + throw std::invalid_argument("Padding mode must be one of 'zeros', 'reflect', or 'replicate'"); + } + + // Set padding mode + this->padding_mode_ = all_padding_modes[padding_mode]; + this->padding_module_ = Padding(this->padding_, this->padding_mode_); + + // Initialize weights and bias + vector weight_shape = {this->out_channels_, this->in_channels_, this->kernel_size_.first, this->kernel_size_.second}; + + this->weight_ = Tensor<>(weight_shape, 0.0); + + if (this->use_bias_) + { + vector bias_shape = {this->out_channels_}; + this->bias_ = Tensor<>(bias_shape, 0.0); + } + + // randomize the weights and bias based on PyTorch implementation + this->reset_parameters(); +} + +Tensor<> Conv2d::forward(const Tensor<> &input) +{ + Tensor<> input_data = input; + this->original_input_shape_ = input.shapes(); + + const vector &output_shape = calculate_output_shape(input.shapes(),
this->out_channels_, this->kernel_size_, this->stride_, this->padding_, this->dilation_); + + if (this->padding_.first > 0 && this->padding_.second > 0) + { + input_data = this->padding_module_.pad(input_data, this->padding_); + } + + // this input is the padded version of the original input + this->input_cache_ = input_data; + + return convolution(this->stride_, this->dilation_, output_shape, input_data, this->weight_, this->bias_, this->use_bias_); +} + +Tensor<> Conv2d::backward(const Tensor<> &grad_output) +{ + // dL_dY = grad_output + + // dL_dW = conv(input_data, dL_dY) + Tensor<> permuted_input = this->input_cache_.permute(1, 0, 2, 3); + Tensor<> permuted_grad_output = grad_output.permute(1, 0, 2, 3); + + // The grad weight shape is initially permuted + const vector permuted_grad_weight_shape = {this->in_channels_, this->out_channels_, this->kernel_size_.first, this->kernel_size_.second}; + + this->grad_weight_ = convolution(this->dilation_, this->stride_, permuted_grad_weight_shape, permuted_input, permuted_grad_output, Tensor<>(), false); + + cout << "grad_weight: " << endl; + this->grad_weight_.print(); + cout << endl; + + // The grad weight shape is permuted back to the original shape + this->grad_weight_ = this->grad_weight_.permute(1, 0, 2, 3); + + // dL_dB = sum(dL_dY, dims=(0, 2, 3)) + if (this->use_bias_) + { + this->grad_bias_ = Tensor<>({this->out_channels_}, 0.0); + for (size_t i = 0; i < grad_output.shapes()[0]; i++) + { + for (size_t j = 0; j < grad_output.shapes()[1]; j++) + { + for (size_t k = 0; k < grad_output.shapes()[2]; k++) + { + for (size_t l = 0; l < grad_output.shapes()[3]; l++) + { + this->grad_bias_[j] += grad_output[i, j, k, l]; + } + } + } + } + + cout << "grad_bias: " << endl; + this->grad_bias_.print(); + cout << endl; + } + + // dL_dX = fullconv(dL_dY, W) + Tensor<> flipped_weight = flip_vertical_and_horizontal(this->weight_); + cout << "flipped_weight: " << endl; + flipped_weight.print(); + cout << endl; + + + Tensor<> permuted_flipped_weight = flipped_weight.permute(1, 0, 2, 3); + + cout << "permuted_flipped_weight: " << endl; + permuted_flipped_weight.print(); + cout << endl; + + Tensor<> copy_grad_output = grad_output; + + if (this->stride_.first > 1 || this->stride_.second > 1) + { + copy_grad_output = dilate_input(copy_grad_output, this->stride_); + } + + const size_t H_further_pad = (this->kernel_size_.first - 1) * this->dilation_.first - this->padding_.first; + const size_t W_further_pad = (this->kernel_size_.second - 1) * this->dilation_.second - this->padding_.second; + + if (H_further_pad > 0 && W_further_pad > 0) + { + copy_grad_output = this->padding_module_.pad(copy_grad_output, {H_further_pad, W_further_pad}); + } + else if (H_further_pad < 0 && W_further_pad < 0) + { + permuted_flipped_weight = this->padding_module_.pad(permuted_flipped_weight, {-H_further_pad, -W_further_pad}); + } + else + { + throw std::invalid_argument("The further padding for dL/dX is not correct"); + } + + Tensor<> grad_input = convolution({1, 1}, this->dilation_, this->original_input_shape_, copy_grad_output, permuted_flipped_weight, Tensor<>(), false); + + return grad_input; +} + +void Conv2d::update_params(const float lr) +{ + this->weight_ -= this->grad_weight_ * lr; + + if (this->use_bias_) + { + this->bias_ -= this->grad_bias_ * lr; + } + + return; +} + +void Conv2d::reset_parameters() +{ + /* + PyTorch implementation: + + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.uniform_(-stdv, stdv) + */ + + size_t n = this->in_channels_; + n *= this->kernel_size_.first * this->kernel_size_.second; + + const double stdv = 1.0 / sqrt(n); + + // Set up the random number generator + random_device rd; + mt19937 gen(rd()); + uniform_real_distribution dis(-stdv, stdv); + + for (size_t i = 0; i < this->out_channels_; i++) + { + for (size_t j = 0; j < this->in_channels_; j++) + { + for (size_t k = 0; k < this->kernel_size_.first; k++) + { + for (size_t l = 0; l < this->kernel_size_.second; l++) + { + this->weight_[i, j, k, l] = dis(gen); + } + } + } + } + + if (this->use_bias_) + { + for (size_t i = 0; i < this->out_channels_; i++) + { + this->bias_[i] = dis(gen); + } + } } diff --git a/src/modules/layers/flatten.cpp b/src/modules/layers/flatten.cpp new file mode 100644 index 0000000..2c06798 --- /dev/null +++ b/src/modules/layers/flatten.cpp @@ -0,0 +1,25 @@ +#include "flatten.hpp" +using namespace nn; + +Flatten::Flatten(int64_t start_dim, int64_t end_dim) : start_dim_(start_dim), end_dim_(end_dim) +{ + cout << "Flatten layer initialized with start_dim = " << start_dim << " and end_dim = " << end_dim << endl; +} + +Tensor<> Flatten::forward(const Tensor<> &input) +{ + this->original_input_shape_ = input.shapes(); + + return input.flatten(this->start_dim_, this->end_dim_); +} + +Tensor<> Flatten::backward(const Tensor<> &grad_output) +{ + return grad_output.reshape(this->original_input_shape_); +} + +void Flatten::update_params(const float lr) +{ + // we don't need to update any parameters + return; +} diff --git a/src/modules/layers/linear.cpp b/src/modules/layers/linear.cpp index c1b01b0..fa1ee40 100644 --- a/src/modules/layers/linear.cpp +++ b/src/modules/layers/linear.cpp @@ -3,94 +3,117 @@ #include "linear.hpp" using namespace nn; -Linear::Linear(size_t in_features, size_t out_features, bool bias) : in_features_(in_features), out_features_(out_features), bias_(bias) { - this->weights_ = Tensor<>({in_features, out_features}, 0.0f); - - if (bias) { - this->biases_ = Tensor<>({out_features, 1}, 0.0f); - } - else { - this->biases_ = Tensor<>(); - } +Linear::Linear(size_t in_features, size_t out_features, bool bias) : in_features_(in_features), out_features_(out_features), use_bias_(bias) +{ + this->weight_ = Tensor<>({in_features, out_features}, 0.0f); - // randomize the weights. The bias is originally 0. 
- this->randomizeParams(); + if (this->use_bias_) + { + this->bias_ = Tensor<>({out_features, 1}, 0.0f); + } - this->grad_weights_ = Tensor<>({in_features, out_features}, 0.0f);; - this->grad_biases_ = Tensor<>({out_features, 1}, 0.0f); + // randomize the weights and bias based on PyTorch implementation + this->reset_parameters(); - this->input_cache_ = Tensor<>(); + cout << "Linear layer initialized with in_features = " << in_features << " and out_features = " << out_features << endl; + cout << &this->input_cache_ << endl; } -Tensor<> Linear::forward(const Tensor<>& input) { +Tensor<> Linear::forward(const Tensor<> &input) +{ this->input_cache_ = input; size_t batchSize = input.shapes()[0]; - const Tensor<>& XW = input.matmul(this->weights_); + const Tensor<> &XW = input.matmul(this->weight_); - if (!this->bias_) { + if (!this->use_bias_) + { return XW; } Tensor<> biases_repeated = Tensor<>({batchSize, this->out_features_}, 0.0f); - for (size_t i = 0; i < batchSize; i++) { - for (size_t j = 0; j < this->out_features_; j++) { - biases_repeated[i, j] = this->biases_[j, 0]; + for (size_t i = 0; i < batchSize; i++) + { + for (size_t j = 0; j < this->out_features_; j++) + { + biases_repeated[i, j] = this->bias_[j, 0]; } } return XW + biases_repeated; } -Tensor<> Linear::backward(const Tensor<>& grad_output) { +Tensor<> Linear::backward(const Tensor<> &grad_output) +{ // dL/dY = grad_output // dL/dW = X^T * dL/dY - this->grad_weights_ = this->input_cache_.transpose().matmul(grad_output); + this->grad_weight_ = this->input_cache_.transpose().matmul(grad_output); // cout << endl << "dL/dW: " << endl; - // this->grad_weights_.print(); + // this->grad_weight_.print(); // cout << endl; // dL/dX = dL/dY * W^T - Tensor<> grad_input = grad_output.matmul(this->weights_.transpose()); + Tensor<> grad_input = grad_output.matmul(this->weight_.transpose()); /* dL/db = dL/dY^T * 1_B (1_B is a vector of ones of size batchSize) dL/db = dL/dY.sum(axis=0) */ - if (this->bias_) - this->grad_biases_ = grad_output.transpose().matmul(Tensor<>({grad_output.shapes()[0], 1}, 1.0f)); + if (this->use_bias_) + this->grad_bias_ = grad_output.transpose().matmul(Tensor<>({grad_output.shapes()[0], 1}, 1.0f)); // cout << endl << "dL/db: " << endl; - // this->grad_biases_.print(); + // this->grad_bias_.print(); // cout << endl; return grad_input; } -void Linear::update_params(const float lr) { +void Linear::update_params(const float lr) +{ - this->weights_ -= this->grad_weights_ * lr; - this->biases_ -= this->grad_biases_ * lr; + this->weight_ -= this->grad_weight_ * lr; + this->bias_ -= this->grad_bias_ * lr; return; } -void Linear::randomizeParams() { +void Linear::reset_parameters() +{ + /* + PyTorch implementation: + + stdv = 1. 
/ math.sqrt(self.weight.size(1)) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.uniform_(-stdv, stdv) + + */ // Calculate the limit for the uniform distribution - double limit = sqrt(6.0f / (this->in_features_ + this->out_features_)); + const double stdv = 1.0 / sqrt(this->weight_.shapes()[0]); // since the weight is transposed // Set up the random number generator random_device rd; mt19937 gen(rd()); - uniform_real_distribution dis(-limit, limit); + uniform_real_distribution dis(-stdv, stdv); // Xavier initialization - for (size_t i = 0; i < this->in_features_; i++) { - for (size_t j = 0; j < this->out_features_; j++) { - this->weights_[i, j] = dis(gen); + for (size_t i = 0; i < this->in_features_; i++) + { + for (size_t j = 0; j < this->out_features_; j++) + { + this->weight_[i, j] = dis(gen); + } + } + + if (this->use_bias_) + { + for (size_t i = 0; i < this->out_features_; i++) + { + this->bias_[i, 0] = dis(gen); } } } \ No newline at end of file diff --git a/src/modules/losses/cross_entropy.cpp b/src/modules/losses/cross_entropy.cpp index 002b585..b20ed72 100644 --- a/src/modules/losses/cross_entropy.cpp +++ b/src/modules/losses/cross_entropy.cpp @@ -3,13 +3,16 @@ #include using namespace nn; -CrossEntropyLoss::CrossEntropyLoss() { - this->softmax_ = Softmax(); +CrossEntropyLoss::CrossEntropyLoss() +{ + cout << "Starting CrossEntropyLoss" << endl; + cout << "CrossEntropyLoss initialized" << endl; } -double CrossEntropyLoss::forward(const Tensor<>& Y_hat, const Tensor<>& Y) { +double CrossEntropyLoss::forward(const Tensor<> &Y_hat, const Tensor<> &Y) +{ /* - L = 1 / B \sum_{i=1}^B \sum_{j=1}^M Y_{ij} * log(softmax(Y_hat_{ij)}) + L = 1 / B \sum_{i=1}^B \sum_{j=1}^M Y_{ij} * log(softmax(Y_hat_{ij})) R^B x M, Y R^B x M @@ -18,14 +21,17 @@ double CrossEntropyLoss::forward(const Tensor<>& Y_hat, const Tensor<>& Y) { // We don't have to store the Y_hat as it is not used in the backward pass. Instead, we store the softmax(Y_hat) // Note that this->Y_cache_ is just a vector with label, and it is not a matrix with one-hot vectors. - if (Y.ndim() == 2) { + if (Y.ndim() == 2) + { // In this case, we assume Y is a matrix of one-hot vectors. 
So we can just store the index of the correct label this->Y_cache_ = Y.argmax().dtype(); } - else if (Y.ndim() == 1) { + else if (Y.ndim() == 1) + { this->Y_cache_ = Y; } - else { + else + { throw std::runtime_error("Currently, Cross Entropy Loss does not support label with more than 2 dimensions."); } @@ -40,7 +46,8 @@ double CrossEntropyLoss::forward(const Tensor<>& Y_hat, const Tensor<>& Y) { // sum up all the elements double loss_without_factor = 0.0f; - for (int i = 0; i < B; ++i) { + for (int i = 0; i < B; ++i) + { // Y_{ij} * log(softmax(Y_hat_{ij})) loss_without_factor += log(softmax_Y_hat[i, static_cast(this->Y_cache_[i])]); } @@ -48,7 +55,8 @@ double CrossEntropyLoss::forward(const Tensor<>& Y_hat, const Tensor<>& Y) { return loss_without_factor * factor; } -Tensor<> CrossEntropyLoss::backward() { +Tensor<> CrossEntropyLoss::backward() +{ /* dL/dY_hat should have the same shape as Y_hat @@ -68,7 +76,8 @@ Tensor<> CrossEntropyLoss::backward() { Since Y is a matrix of one-hot vectors, only the correct label is 1 and the rest are 0 */ - for (int i = 0; i < B; ++i) { + for (int i = 0; i < B; ++i) + { grad_output[i, static_cast(this->Y_cache_[i])] -= 1.0f; } diff --git a/src/utils/conv2d_utils.cpp b/src/utils/conv2d_utils.cpp new file mode 100644 index 0000000..457e971 --- /dev/null +++ b/src/utils/conv2d_utils.cpp @@ -0,0 +1,260 @@ +#include "conv2d_utils.hpp" + +Tensor<> Padding::pad(const Tensor<> &input, const size_tp2 &padding) const +{ + switch (this->padding_mode_) + { + case PaddingMode::ZEROS: + return this->zero_pad(input, padding); + break; + default: + throw std::invalid_argument("Invalid padding mode"); + } +} + +Tensor<> Padding::zero_pad(const Tensor<> &input, const size_tp2 &padding) const +{ + const vector &input_shape = input.shapes(); + + if (input_shape.size() != 4) + { + throw std::invalid_argument("Input shape must be 4D"); + } + + const size_t B = input_shape[0]; + const size_t C = input_shape[1]; + const size_t H = input_shape[2]; + const size_t W = input_shape[3]; + + const size_t padded_H = H + padding.first * 2; + const size_t padded_W = W + padding.second * 2; + + Tensor<> padded_output({B, C, padded_H, padded_W}, 0.0); + + for (size_t b = 0; b < B; ++b) + { + for (size_t c = 0; c < C; ++c) + { + for (size_t h = 0; h < H; ++h) + { + for (size_t w = 0; w < W; ++w) + { + padded_output[b, c, h + padding.first, w + padding.second] = input[b, c, h, w]; + } + } + } + } + + return padded_output; +} + +Tensor<> convolution(const size_tp2 &stride, const size_tp2 &dilation, const vector &output_shape, const Tensor<> &input, const Tensor<> &kernel, const Tensor<> &bias, bool use_bias) +{ + const vector &input_shape = input.shapes(); + const vector &kernel_shape = kernel.shapes(); + + if (output_shape.size() != 4) + { + throw std::invalid_argument("Output shape must be 4D"); + } + if (input_shape.size() != 4) + { + throw std::invalid_argument("Input shape must be 4D"); + } + if (kernel_shape.size() != 4) + { + throw std::invalid_argument("Kernel shape must be 4D"); + } + + const size_t B = output_shape[0]; + const size_t C_out = output_shape[1]; + const size_t H_out = output_shape[2]; + const size_t W_out = output_shape[3]; + + const size_t C_in = input_shape[1]; + const size_t H_in = input_shape[2]; + const size_t W_in = input_shape[3]; + + const size_t K_H = kernel_shape[2]; + const size_t K_W = kernel_shape[3]; + + Tensor<> output(output_shape, 0.0); + + /* + The logic behind is that + Let's us first focus on the first kernel among all out_channel kernels + + Each 
input channel of the data is convolved with the same channel of the kernel, and the result is added to the output + Meaning that each input data channel only corresponds to the same channel of the kernel + + For example, the channel 1 of the input data is convolved with the channel 1 of the kernel, but it will not be convolved with the channel 2 of the kernel + + After each input data channel convolving with the same channel of the kernel, element-wise addition is performed among all the convolved result with the first kernel + + Now we get a single output channel + + We repeat this process for all the out_channel channels + + And finally we will get an output with out_channel channels + */ + + for (size_t b = 0; b < B; ++b) + { + for (size_t c = 0; c < C_out; ++c) + { + for (size_t h = 0; h < H_out; ++h) + { + for (size_t w = 0; w < W_out; ++w) + { + size_t h_start = h * stride.first; + size_t w_start = w * stride.second; + + for (size_t ic = 0; ic < C_in; ++ic) + { + for (size_t kh = 0; kh < K_H; ++kh) + { + for (size_t kw = 0; kw < K_W; ++kw) + { + size_t h_in = h_start + kh * dilation.first; + size_t w_in = w_start + kw * dilation.second; + + if (h_in >= 0 && h_in < H_in && w_in >= 0 && w_in < W_in) + { + output[b, c, h, w] += input[b, ic, h_in, w_in] * kernel[c, ic, kh, kw]; + } + } + } + } + + if (use_bias) + { + output[b, c, h, w] += bias[c]; + } + } + } + } + } + + return output; +} + +/** + * Calculate the output shape of a 2D convolutional layer. + * + * @param input_shape The shape of the input tensor, which is a 4D tensor with shape (B, C_in, H_in, W_in). + * @param out_channel The number of output channels. + * @param kernel_size The size of the kernel, which is a 2D integer pair. + * @param stride The stride of the convolution, which is a 2D integer pair. + * @param padding The padding of the convolution, which is a 2D integer pair. + * @param dilation The dilation of the convolution, which is a 2D integer pair. + * @return The output shape, which is a 4D vector with shape (B, out_channel, H_out, W_out). + * + * @throws std::invalid_argument if input_shape is not 4D or if the output shape is invalid. 
+ */ +const vector calculate_output_shape(const vector &input_shape, const int64_t out_channel, const size_tp2 &kernel_size, const size_tp2 &stride, const size_tp2 &padding, const size_tp2 &dilation) +{ + if (input_shape.size() != 4) + { + throw std::invalid_argument("Input shape must be 4D"); + } + + const size_t B = input_shape[0]; + const size_t H_in = input_shape[2]; + const size_t W_in = input_shape[3]; + + cout << "Btach Size : " << B << endl; + cout << "H_in : " << H_in << endl; + cout << "W_in : " << W_in << endl; + cout << "Out Channel : " << out_channel << endl; + cout << "Kernel Size : " << kernel_size.first << ", " << kernel_size.second << endl; + cout << "Stride : " << stride.first << ", " << stride.second << endl; + cout << "Padding : " << padding.first << ", " << padding.second << endl; + cout << "Dilation : " << dilation.first << ", " << dilation.second << endl; + + const int64_t H_out = (H_in + 2 * padding.first - dilation.first * (kernel_size.first - 1) - 1) / stride.first + 1; + const int64_t W_out = (W_in + 2 * padding.second - dilation.second * (kernel_size.second - 1) - 1) / stride.second + 1; + + if (H_out <= 0 || W_out <= 0) + { + throw std::invalid_argument("Invalid output shape"); + } + + return {B, (size_t)out_channel, (size_t)H_out, (size_t)W_out}; +} + +Tensor<> flip_vertical_and_horizontal(const Tensor<> &input) +{ + if (input.ndim() != 4) + { + throw std::invalid_argument("Input shape must be 4D"); + } + + Tensor<> output = input; + + const size_t B = input.shapes()[0]; + const size_t C = input.shapes()[1]; + const size_t H = input.shapes()[2]; + const size_t W = input.shapes()[3]; + + double cache; + + for (size_t b = 0; b < B; ++b) + { + for (size_t c = 0; c < C; ++c) + { + for (size_t h = 0; h < H / 2; ++h) + { + for (size_t w = 0; w < W; ++w) + { + cache = output[b, c, h, w]; + output[b, c, h, w] = output[b, c, H - h - 1, w]; + output[b, c, H - h - 1, w] = cache; + } + } + for (size_t h = 0; h < H; ++h) + { + for (size_t w = 0; w < W / 2; ++w) + { + cache = output[b, c, h, w]; + output[b, c, h, w] = output[b, c, h, W - w - 1]; + output[b, c, h, W - w - 1] = cache; + } + } + } + } + + return output; +} +Tensor<> dilate_input(const Tensor<> &input, const size_tp2 &dilation) +{ + if (input.ndim() != 4) + { + throw std::invalid_argument("Input shape must be 4D"); + } + + const size_t B = input.shapes()[0]; + const size_t C = input.shapes()[1]; + const size_t H = input.shapes()[2]; + const size_t W = input.shapes()[3]; + + const size_t H_dilated = H + (H - 1) * (dilation.first - 1); + const size_t W_dilated = W + (W - 1) * (dilation.second - 1); + + Tensor<> dilated_input({B, C, H_dilated, W_dilated}, 0.0); + + for (size_t b = 0; b < B; ++b) + { + for (size_t c = 0; c < C; ++c) + { + for (size_t h = 0; h < H; ++h) + { + for (size_t w = 0; w < W; ++w) + { + dilated_input[b, c, h * dilation.first, w * dilation.second] = input[b, c, h, w]; + } + } + } + } + + return dilated_input; +} \ No newline at end of file diff --git a/src/utils/tensor_utils.cpp b/src/utils/tensor_utils.cpp index 1669905..0baa8a6 100644 --- a/src/utils/tensor_utils.cpp +++ b/src/utils/tensor_utils.cpp @@ -55,7 +55,6 @@ vector apply_slice(const Slice& slice, size_t dim_size) { // cout << "start applying slice" << endl; for (size_t i = start; i < stop; i += step) { - // cout << "i: " << i << endl; indices.push_back(i); } return indices; diff --git a/src/utils/utils.cpp b/src/utils/utils.cpp index 6a90dfe..e4de46b 100644 --- a/src/utils/utils.cpp +++ b/src/utils/utils.cpp @@ -1,15 +1,17 @@ 
#include "utils.hpp" -void print_training_stats(int batch, float loss, float accuracy) { +void print_stats(int batch, float loss, float accuracy) +{ cout << "\rBatch " << setw(4) << batch << " " - << "Loss: " << fixed << setprecision(5) << setw(8) << loss << " " - << "Accuracy: " << fixed << setprecision(2) << setw(6) << accuracy * 100 << "%" - << flush; + << "Loss: " << fixed << setprecision(5) << setw(8) << loss << " " + << "Accuracy: " << fixed << setprecision(2) << setw(6) << accuracy * 100 << "%" + << flush; } -void print_training_stats_line(int batch, float loss, float accuracy) { +void print_stats_line(int batch, float loss, float accuracy) +{ cout << "Batch " << setw(4) << batch << " " - << "Loss: " << fixed << setprecision(5) << setw(8) << loss << " " - << "Accuracy: " << fixed << setprecision(2) << setw(6) << accuracy * 100 << "%" - << endl; + << "Loss: " << fixed << setprecision(5) << setw(8) << loss << " " + << "Accuracy: " << fixed << setprecision(2) << setw(6) << accuracy * 100 << "%" + << endl; } \ No newline at end of file diff --git a/tests/core/tensor_test.cpp b/tests/core/tensor_test.cpp index 955ecb0..9fb0c31 100644 --- a/tests/core/tensor_test.cpp +++ b/tests/core/tensor_test.cpp @@ -3,12 +3,14 @@ #include "tensor.hpp" #include "math.h" -TEST_CASE("TensorTest - Constructor and Destructor") { +TEST_CASE("TensorTest - Constructor and Destructor") +{ Tensor<> tensor; // No explicit assertions needed, just verify no crashes } -TEST_CASE("TensorTest - Scaler Constructor") { +TEST_CASE("TensorTest - Scaler Constructor") +{ Tensor<> tensor(10.0f); CHECK(tensor.ndim() == 1); CHECK(tensor.size() == 1); @@ -16,7 +18,8 @@ TEST_CASE("TensorTest - Scaler Constructor") { CHECK(tensor[0] == 10); } -TEST_CASE("TensorTest - 1D Tensor Constructor from initializer_list") { +TEST_CASE("TensorTest - 1D Tensor Constructor from initializer_list") +{ Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; CHECK(tensor_1d.ndim() == 1); CHECK(tensor_1d.size() == 4); @@ -33,7 +36,8 @@ TEST_CASE("TensorTest - 1D Tensor Constructor from initializer_list") { CHECK(tensor_1d_1val[0] == 0.0f); } -TEST_CASE("TensorTest - 2D Tensor Constructor from initializer_list") { +TEST_CASE("TensorTest - 2D Tensor Constructor from initializer_list") +{ Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; CHECK(tensor_2d.ndim() == 2); CHECK(tensor_2d.size() == 4); @@ -43,7 +47,7 @@ TEST_CASE("TensorTest - 2D Tensor Constructor from initializer_list") { CHECK(tensor_2d[0, 1] == 2.0f); CHECK(tensor_2d[1, 0] == 3.0f); CHECK(tensor_2d[1, 1] == 4.0f); - + Tensor<> tensor_2d_1row = {{0.0f, 0.0f}}; CHECK(tensor_2d_1row.ndim() == 2); CHECK(tensor_2d_1row.size() == 2); @@ -61,7 +65,8 @@ TEST_CASE("TensorTest - 2D Tensor Constructor from initializer_list") { // CHECK(tensor_2d_1col[1, 0] == 0.0f); } -TEST_CASE("TensorTest - 3D Tensor Constructor from initializer_list") { +TEST_CASE("TensorTest - 3D Tensor Constructor from initializer_list") +{ Tensor<> tensor = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; CHECK(tensor.ndim() == 3); CHECK(tensor.size() == 8); @@ -87,7 +92,8 @@ TEST_CASE("TensorTest - 3D Tensor Constructor from initializer_list") { CHECK(tensor2[1, 1, 1] == 0.0f); } -TEST_CASE("TensorTest - 1D Tensor Constructor from vector") { +TEST_CASE("TensorTest - 1D Tensor Constructor from vector") +{ vector data = {1.0f, 2.0f, 3.0f, 4.0f}; Tensor<> tensor1 = data; CHECK(tensor1.ndim() == 1); @@ -106,7 +112,8 @@ TEST_CASE("TensorTest - 1D Tensor Constructor from vector") { CHECK(tensor2[0] == 0.0f); } 
-TEST_CASE("TensorTest - 2D Tensor Constructor from vector") { +TEST_CASE("TensorTest - 2D Tensor Constructor from vector") +{ vector> data = {{1.0f, 2.0f}, {3.0f, 4.0f}}; Tensor<> tensor = data; CHECK(tensor.ndim() == 2); @@ -117,7 +124,7 @@ TEST_CASE("TensorTest - 2D Tensor Constructor from vector") { CHECK(tensor[0, 1] == 2.0f); CHECK(tensor[1, 0] == 3.0f); CHECK(tensor[1, 1] == 4.0f); - + vector> data2 = {{0.0f, 0.0f}}; Tensor<> tensor2 = data2; CHECK(tensor2.ndim() == 2); @@ -128,7 +135,8 @@ TEST_CASE("TensorTest - 2D Tensor Constructor from vector") { CHECK(tensor2[0, 1] == 0.0f); } -TEST_CASE("TensorTest - 3D Tensor Constructor from vector") { +TEST_CASE("TensorTest - 3D Tensor Constructor from vector") +{ vector>> data = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; Tensor<> tensor = data; CHECK(tensor.ndim() == 3); @@ -156,7 +164,8 @@ TEST_CASE("TensorTest - 3D Tensor Constructor from vector") { CHECK(tensor2[1, 1, 1] == 0.0f); } -TEST_CASE("TensorTest - 4D Tensor Constructor from vector") { +TEST_CASE("TensorTest - 4D Tensor Constructor from vector") +{ vector>>> data = {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, {{{9.0f, 10.0f}, {11.0f, 12.0f}}, {{13.0f, 14.0f}, {15.0f, 16.0f}}}}; Tensor<> tensor = data; CHECK(tensor.ndim() == 4); @@ -172,7 +181,8 @@ TEST_CASE("TensorTest - 4D Tensor Constructor from vector") { CHECK(tensor[1, 1, 1, 1] == 16.0f); } -TEST_CASE("TensorTest - Copy Constructor") { +TEST_CASE("TensorTest - Copy Constructor") +{ // 1D tensor Tensor<> tensor1 = {1.0f, 2.0f, 3.0f, 4.0f}; Tensor<> test_tensor = tensor1; @@ -219,7 +229,8 @@ TEST_CASE("TensorTest - Copy Constructor") { CHECK(test_tensor[1, 1, 1] == 8.0f); } -TEST_CASE("TensorTest - Move Constructor") { +TEST_CASE("TensorTest - Move Constructor") +{ // 1D tensor Tensor<> tensor1 = {1.0f, 2.0f, 3.0f, 4.0f}; Tensor<> test_tensor = std::move(tensor1); @@ -266,7 +277,8 @@ TEST_CASE("TensorTest - Move Constructor") { CHECK(test_tensor[1, 1, 1] == 8.0f); } -TEST_CASE("TensorTest - Certain Value Constructor") { +TEST_CASE("TensorTest - Certain Value Constructor") +{ Tensor<> tensor_1d({1}, 0.0f); CHECK(tensor_1d.ndim() == 1); CHECK(tensor_1d.size() == 1); @@ -283,7 +295,6 @@ TEST_CASE("TensorTest - Certain Value Constructor") { CHECK(tensor_2d[1, 0] == 10.0f); CHECK(tensor_2d[1, 1] == 10.0f); - Tensor<> tensor_3d({2, 2, 2}, 5.0f); CHECK(tensor_3d.ndim() == 3); CHECK(tensor_3d.size() == 8); @@ -297,7 +308,8 @@ TEST_CASE("TensorTest - Certain Value Constructor") { CHECK(tensor_3d[1, 1, 1] == 5.0f); } -TEST_CASE("TensorTest - Indexing Operator") { +TEST_CASE("TensorTest - Indexing Operator") +{ Tensor<> tensor = {1.0f, 2.0f, 3.0f, 4.0f}; CHECK(tensor[0] == 1.0f); CHECK(tensor[1] == 2.0f); @@ -318,7 +330,8 @@ TEST_CASE("TensorTest - Indexing Operator") { CHECK(tensor[1, 1, 1] == 8.0f); } -TEST_CASE("TensorTest - Indexing Operator - Out of Bound") { +TEST_CASE("TensorTest - Indexing Operator - Out of Bound") +{ Tensor<> tensor = {1.0f, 2.0f, 3.0f, 4.0f}; CHECK_THROWS(tensor[4]); @@ -332,7 +345,8 @@ TEST_CASE("TensorTest - Indexing Operator - Out of Bound") { CHECK_THROWS(tensor[0, 0, 2]); } -TEST_CASE("TensorTest - Indexing Operator - Negative Indexing") { +TEST_CASE("TensorTest - Indexing Operator - Negative Indexing") +{ Tensor<> tensor = {1.0f, 2.0f, 3.0f, 4.0f}; CHECK(tensor[-1] == 4.0f); CHECK(tensor[-2] == 3.0f); @@ -350,7 +364,8 @@ TEST_CASE("TensorTest - Indexing Operator - Negative Indexing") { CHECK(tensor[0, -1, 0] == 3.0f); } -TEST_CASE("TensorTest - Indexing Operator - 
Normal Slicing") { +TEST_CASE("TensorTest - Indexing Operator - Normal Slicing") +{ Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; Tensor<> sliced_tensor_1d_1 = tensor_1d.index({":2"}); CHECK(sliced_tensor_1d_1.ndim() == 1); @@ -391,7 +406,8 @@ TEST_CASE("TensorTest - Indexing Operator - Normal Slicing") { CHECK(sliced_tensor_2d_1[1, 1] == 4.0f); } -TEST_CASE("TensorTest - Transpose") { +TEST_CASE("TensorTest - Transpose") +{ Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; Tensor<> transposed_tensor_2d = tensor_2d.transpose(); CHECK(transposed_tensor_2d.ndim() == 2); @@ -415,50 +431,53 @@ TEST_CASE("TensorTest - Transpose") { CHECK(transposed_tensor_1d[-1, -1] == 4.0f); } -TEST_CASE("TensorTest - flatten") { +TEST_CASE("TensorTest - flatten") +{ Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; - tensor_2d.flatten(); - CHECK(tensor_2d.ndim() == 1); - CHECK(tensor_2d.size() == 4); - CHECK(tensor_2d.shapes()[0] == 4); - CHECK(tensor_2d[0] == 1.0f); - CHECK(tensor_2d[1] == 2.0f); - CHECK(tensor_2d[2] == 3.0f); - CHECK(tensor_2d[3] == 4.0f); + Tensor<> flattened_tensor_2d = tensor_2d.flatten(); + CHECK(flattened_tensor_2d.ndim() == 1); + CHECK(flattened_tensor_2d.size() == 4); + CHECK(flattened_tensor_2d.shapes()[0] == 4); + CHECK(flattened_tensor_2d[0] == 1.0f); + CHECK(flattened_tensor_2d[1] == 2.0f); + CHECK(flattened_tensor_2d[2] == 3.0f); + CHECK(flattened_tensor_2d[3] == 4.0f); } -TEST_CASE("TensorTest - reshape") { +TEST_CASE("TensorTest - reshape") +{ Tensor<> tensor_2d = {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}}; - tensor_2d.reshape({2, 3, 2}); - CHECK(tensor_2d.ndim() == 3); - CHECK(tensor_2d.size() == 12); - CHECK(tensor_2d.shapes()[0] == 2); - CHECK(tensor_2d.shapes()[1] == 3); - CHECK(tensor_2d.shapes()[2] == 2); - CHECK(tensor_2d[0, 0, 0] == 1.0f); - CHECK(tensor_2d[0, 0, 1] == 2.0f); - CHECK(tensor_2d[0, 1, 0] == 3.0f); - CHECK(tensor_2d[0, 1, 1] == 4.0f); - CHECK(tensor_2d[0, 2, 0] == 5.0f); - CHECK(tensor_2d[0, 2, 1] == 6.0f); - CHECK(tensor_2d[1, 0, 0] == 7.0f); - CHECK(tensor_2d[-1, -1, -1] == 12.0f); + Tensor<> reshaped_tensor_2d = tensor_2d.reshape({2, 3, 2}); + CHECK(reshaped_tensor_2d.ndim() == 3); + CHECK(reshaped_tensor_2d.size() == 12); + CHECK(reshaped_tensor_2d.shapes()[0] == 2); + CHECK(reshaped_tensor_2d.shapes()[1] == 3); + CHECK(reshaped_tensor_2d.shapes()[2] == 2); + CHECK(reshaped_tensor_2d[0, 0, 0] == 1.0f); + CHECK(reshaped_tensor_2d[0, 0, 1] == 2.0f); + CHECK(reshaped_tensor_2d[0, 1, 0] == 3.0f); + CHECK(reshaped_tensor_2d[0, 1, 1] == 4.0f); + CHECK(reshaped_tensor_2d[0, 2, 0] == 5.0f); + CHECK(reshaped_tensor_2d[0, 2, 1] == 6.0f); + CHECK(reshaped_tensor_2d[1, 0, 0] == 7.0f); + CHECK(reshaped_tensor_2d[-1, -1, -1] == 12.0f); Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - tensor_1d.reshape({2, 3}); - CHECK(tensor_1d.ndim() == 2); - CHECK(tensor_1d.size() == 6); - CHECK(tensor_1d.shapes()[0] == 2); - CHECK(tensor_1d.shapes()[1] == 3); - CHECK(tensor_1d[0, 0] == 1.0f); - CHECK(tensor_1d[0, 1] == 2.0f); - CHECK(tensor_1d[0, 2] == 3.0f); - CHECK(tensor_1d[1, 0] == 4.0f); - CHECK(tensor_1d[1, 1] == 5.0f); - CHECK(tensor_1d[1, 2] == 6.0f); + Tensor<> reshaped_tensor_1d = tensor_1d.reshape({2, 3}); + CHECK(reshaped_tensor_1d.ndim() == 2); + CHECK(reshaped_tensor_1d.size() == 6); + CHECK(reshaped_tensor_1d.shapes()[0] == 2); + CHECK(reshaped_tensor_1d.shapes()[1] == 3); + CHECK(reshaped_tensor_1d[0, 0] == 1.0f); + CHECK(reshaped_tensor_1d[0, 1] == 2.0f); + CHECK(reshaped_tensor_1d[0, 2] == 3.0f); + 
CHECK(reshaped_tensor_1d[1, 0] == 4.0f); + CHECK(reshaped_tensor_1d[1, 1] == 5.0f); + CHECK(reshaped_tensor_1d[1, 2] == 6.0f); } -TEST_CASE("TensorTest - abs") { +TEST_CASE("TensorTest - abs") +{ Tensor<> tensor_2d = {{-1.0f, -2.0f}, {3.0f, 4.0f}}; Tensor<> abs_tensor_2d = tensor_2d.abs(); CHECK(abs_tensor_2d.ndim() == 2); @@ -471,7 +490,8 @@ TEST_CASE("TensorTest - abs") { CHECK(abs_tensor_2d[1, 1] == 4.0f); } -TEST_CASE("TensorTest - sum") { +TEST_CASE("TensorTest - sum") +{ Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; double sum_1d = tensor_1d.sum(); CHECK(sum_1d == 10.0f); @@ -485,9 +505,11 @@ TEST_CASE("TensorTest - sum") { CHECK(sum_3d == 36.0f); } -TEST_CASE("TensorTest - filter") { +TEST_CASE("TensorTest - filter") +{ Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; - Tensor<> filtered_tensor_1d = tensor_1d.filter([](double x) { return x < 3.0f; }); + Tensor<> filtered_tensor_1d = tensor_1d.filter([](double x) + { return x < 3.0f; }); CHECK(filtered_tensor_1d.ndim() == 1); CHECK(filtered_tensor_1d.size() == 4); CHECK(filtered_tensor_1d.shapes()[0] == 4); @@ -497,7 +519,8 @@ TEST_CASE("TensorTest - filter") { CHECK(filtered_tensor_1d[3] == 0.0f); Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; - Tensor<> filtered_tensor_2d = tensor_2d.filter([](double x) { return x < 3.0f; }); + Tensor<> filtered_tensor_2d = tensor_2d.filter([](double x) + { return x < 3.0f; }); CHECK(filtered_tensor_2d.ndim() == 2); CHECK(filtered_tensor_2d.size() == 4); CHECK(filtered_tensor_2d.shapes()[0] == 2); @@ -508,7 +531,8 @@ TEST_CASE("TensorTest - filter") { CHECK(filtered_tensor_2d[1, 1] == 0.0f); Tensor<> tensor_3d = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; - Tensor<> filtered_tensor_3d = tensor_3d.filter([](double x) { return x < 3.0f; }); + Tensor<> filtered_tensor_3d = tensor_3d.filter([](double x) + { return x < 3.0f; }); CHECK(filtered_tensor_3d.ndim() == 3); CHECK(filtered_tensor_3d.size() == 8); CHECK(filtered_tensor_3d.shapes()[0] == 2); @@ -524,11 +548,13 @@ TEST_CASE("TensorTest - filter") { CHECK(filtered_tensor_3d[1, 1, 1] == 0.0f); } -TEST_CASE("TensorTest - map") { +TEST_CASE("TensorTest - map") +{ double eps = 1e-5f; Tensor<> tensor_1d = {1.0f, 2.0f, 3.0f, 4.0f}; - Tensor<> tensor_1d_exp = tensor_1d.map([](double x) { return exp(x); }); + Tensor<> tensor_1d_exp = tensor_1d.map([](double x) + { return exp(x); }); CHECK(tensor_1d_exp.ndim() == 1); CHECK(tensor_1d_exp.size() == 4); CHECK(tensor_1d_exp.shapes()[0] == 4); @@ -538,7 +564,8 @@ TEST_CASE("TensorTest - map") { CHECK(tensor_1d_exp[3] - exp(4.0f) < eps); Tensor<> tensor_2d = {{1.0f, 2.0f}, {3.0f, 4.0f}}; - Tensor<> tensor_2d_times_10 = tensor_2d.map([](double x) { return x * 10.0f; }); + Tensor<> tensor_2d_times_10 = tensor_2d.map([](double x) + { return x * 10.0f; }); CHECK(tensor_2d_times_10.ndim() == 2); CHECK(tensor_2d_times_10.size() == 4); CHECK(tensor_2d_times_10.shapes()[0] == 2); @@ -549,7 +576,8 @@ TEST_CASE("TensorTest - map") { CHECK(tensor_2d_times_10[1, 1] == 40.0f); Tensor<> tensor_3d = {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}; - Tensor<> tensor_3d_log = tensor_3d.map([](double x) { return log(x); }); + Tensor<> tensor_3d_log = tensor_3d.map([](double x) + { return log(x); }); CHECK(tensor_3d_log.ndim() == 3); CHECK(tensor_3d_log.size() == 8); CHECK(tensor_3d_log.shapes()[0] == 2); @@ -565,7 +593,8 @@ TEST_CASE("TensorTest - map") { CHECK(tensor_3d_log[1, 1, 1] - log(8.0f) < eps); } -TEST_CASE("TensorTest - equal") { +TEST_CASE("TensorTest - equal") +{ Tensor<> tensor_1d = 
{1.0f, 2.0f, 3.0f, 4.0f}; Tensor<> another_tensor_1d = {1.0f, 2.0f, 5.0f, 4.0f}; Tensor equal_tensor_1d = tensor_1d.equal(another_tensor_1d); @@ -607,7 +636,8 @@ TEST_CASE("TensorTest - equal") { CHECK(equal_tensor_3d[1, 1, 1] == 1); } -TEST_CASE("TensorTest - Matrix Multiplication") { +TEST_CASE("TensorTest - Matrix Multiplication") +{ Tensor<> tensor_2d_1 = {{1.0f, 2.0f}, {3.0f, 4.0f}}; Tensor<> transposed_tensor_2d_1 = tensor_2d_1.transpose(); Tensor<> matrix_multiplication_2d_1 = tensor_2d_1.matmul(transposed_tensor_2d_1); @@ -620,5 +650,4 @@ TEST_CASE("TensorTest - Matrix Multiplication") { CHECK(matrix_multiplication_2d_1[0, 1] == 11.0f); CHECK(matrix_multiplication_2d_1[1, 0] == 11.0f); CHECK(matrix_multiplication_2d_1[1, 1] == 25.0f); - } \ No newline at end of file
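
The flatten and reshape hunks above change both methods from mutating the tensor in place to returning a new Tensor<>. A minimal usage sketch of that contract follows; it relies only on the members exercised by the tests (flatten, reshape, ndim, shapes, the multi-argument operator[]). The header name tensor.hpp (inferred from src/core/tensor.cpp and the flat include style of the examples), the variable names, and the claim that the source tensor is left unchanged are assumptions, not facts established by this diff.

// Sketch of the out-of-place flatten()/reshape() usage implied by the updated tests.
// Assumption: the Tensor<> class is declared in "tensor.hpp"; the tests only inspect
// the returned tensor, so "t is left unchanged" is an assumption of this sketch.
#include "tensor.hpp"
#include <iostream>
using namespace nn;

int main()
{
    Tensor<> t = {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}};

    // flatten() returns a 1-D tensor rather than flattening t in place
    Tensor<> flat = t.flatten();
    std::cout << flat.ndim() << " " << flat.shapes()[0] << std::endl; // expected: 1 6

    // reshape() likewise returns the reshaped tensor
    Tensor<> reshaped = t.reshape({3, 2});
    std::cout << reshaped[2, 1] << std::endl; // expected: 6 (last element in row-major order)

    return 0;
}

The sketch assumes the same C++23 multi-argument operator[] that the tests already use for element access.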