diff --git a/core/Inst.hpp b/core/Inst.hpp
index a5987f4d..8ae06469 100644
--- a/core/Inst.hpp
+++ b/core/Inst.hpp
@@ -259,6 +259,9 @@ namespace olympia
        uint32_t getOpCode() const { return static_cast<uint32_t>(opcode_info_->getOpcode()); }

+        // Get the data size in bytes
+        uint32_t getMemAccessSize() const { return static_cast<uint32_t>(opcode_info_->getDataSize() / 8); } // opcode_info's data size is in bits
+
        mavis::InstructionUniqueID getMavisUid() const
        {
            return opcode_info_->getInstructionUniqueID();
diff --git a/core/lsu/LSU.cpp b/core/lsu/LSU.cpp
index 5be73076..642ac6c9 100644
--- a/core/lsu/LSU.cpp
+++ b/core/lsu/LSU.cpp
@@ -20,6 +20,8 @@ namespace olympia
        replay_buffer_("replay_buffer", p->replay_buffer_size, getClock()),
        replay_buffer_size_(p->replay_buffer_size),
        replay_issue_delay_(p->replay_issue_delay),
+        store_buffer_("store_buffer", p->ldst_inst_queue_size, getClock()), // store buffer shares the ldst queue sizing
+        store_buffer_size_(p->ldst_inst_queue_size),
        ready_queue_(),
        load_store_info_allocator_(sparta::notNull(OlympiaAllocators::getOlympiaAllocators(node))
                                       ->load_store_info_allocator),
@@ -35,7 +37,8 @@ namespace olympia
              + p->cache_read_stage_length), // Complete stage is after the cache read stage
        ldst_pipeline_("LoadStorePipeline", (complete_stage_ + 1),
                       getClock()), // complete_stage_ + 1 is number of stages
-        allow_speculative_load_exec_(p->allow_speculative_load_exec)
+        allow_speculative_load_exec_(p->allow_speculative_load_exec),
+        allow_data_forwarding_(p->allow_data_forwarding)
    {
        sparta_assert(p->mmu_lookup_stage_length > 0,
                      "MMU lookup stage should atleast be one cycle");
@@ -48,6 +51,7 @@ namespace olympia
        ldst_pipeline_.enableCollection(node);
        ldst_inst_queue_.enableCollection(node);
        replay_buffer_.enableCollection(node);
+        store_buffer_.enableCollection(node);

        // Startup handler for sending initial credits
        sparta::StartupEvent(node, CREATE_SPARTA_HANDLER(LSU, sendInitialCredits_));
@@ -177,6 +181,12 @@ namespace olympia
    {
        ILOG("New instruction added to the ldst queue " << inst_ptr);
        allocateInstToIssueQueue_(inst_ptr);
+        // Stores are also allocated to the store buffer
+        if (inst_ptr->isStoreInst())
+        {
+            allocateInstToStoreBuffer_(inst_ptr);
+        }
+
        handleOperandIssueCheck_(inst_ptr);
        lsu_insts_dispatched_++;
    }
@@ -265,7 +275,19 @@ namespace olympia
        sparta_assert(inst_ptr->getStatus() == Inst::Status::RETIRED,
                      "Get ROB Ack, but the store inst hasn't retired yet!");

-        ++stores_retired_;
+        if (inst_ptr->isStoreInst())
+        {
+            auto oldest_store = getOldestStore_();
+            sparta_assert(oldest_store && oldest_store->getInstPtr()->getUniqueID() == inst_ptr->getUniqueID(),
+                          "Attempting to retire store out of order! Expected: "
+                              << (oldest_store ? oldest_store->getInstPtr()->getUniqueID() : 0)
+                              << " Got: " << inst_ptr->getUniqueID());
+
+            // Remove from the store buffer; no cache request needs to be sent here
+            store_buffer_.erase(store_buffer_.begin());
+            ++stores_retired_;
+        }
+
        updateIssuePriorityAfterStoreInstRetire_(inst_ptr);

        if (isReadyToIssueInsts_())
@@ -447,7 +469,7 @@ namespace olympia
        {
            updateInstReplayReady_(load_store_info_ptr);
        }
-        // There might not be a wake up because the cache cannot handle nay more instruction
+        // There might not be a wake up because the cache cannot handle any more instruction
        // Change to nack wakeup when implemented
        if (!load_store_info_ptr->isInReadyQueue())
        {
@@ -468,7 +490,7 @@ namespace olympia
        // If have passed translation and the instruction is a store,
        // then it's good to be retired (i.e. mark it completed).
        // Stores typically do not cause a flush after a successful
-        // translation. We now wait for the Retire block to "retire"
+        // translation. We now wait for the Retire block to "retire"
        // it, meaning it's good to go to the cache
        if (inst_ptr->isStoreInst() && (inst_ptr->getStatus() == Inst::Status::SCHEDULED))
        {
@@ -483,14 +505,14 @@ namespace olympia
            return;
        }

-        // Loads dont perform a cache lookup if there are older stores present in the load store
-        // queue
+        // Loads don't perform a cache lookup if older stores in the load/store queue have not yet issued
        if (!inst_ptr->isStoreInst() && !allOlderStoresIssued_(inst_ptr)
            && allow_speculative_load_exec_)
        {
            ILOG("Dropping speculative load " << inst_ptr);
            load_store_info_ptr->setState(LoadStoreInstInfo::IssueState::READY);
            ldst_pipeline_.invalidateStage(cache_lookup_stage_);
+            // TODO: confirm the intended semantics of allow_speculative_load_exec_ on this path
            if (allow_speculative_load_exec_)
            {
                updateInstReplayReady_(load_store_info_ptr);
@@ -498,6 +520,21 @@ namespace olympia
            return;
        }

+        // Store-to-load forwarding check for loads
+        if (!inst_ptr->isStoreInst() && allow_data_forwarding_)
+        {
+            // Checks the store buffer against the load's address and access size
+            auto forwarding_store = tryStoreToLoadForwarding(inst_ptr);
+
+            if (forwarding_store)
+            {
+                ILOG("Found forwarding store for load " << inst_ptr);
+                mem_access_info_ptr->setDataReady(true);
+                mem_access_info_ptr->setCacheState(MemoryAccessInfo::CacheState::HIT);
+                return;
+            }
+        }
+
        const bool is_already_hit =
            (mem_access_info_ptr->getCacheState() == MemoryAccessInfo::CacheState::HIT);
        const bool is_unretired_store =
@@ -790,6 +827,7 @@ namespace olympia
        flushIssueQueue_(criteria);
        flushReplayBuffer_(criteria);
        flushReadyQueue_(criteria);
+        flushStoreBuffer_(criteria);

        // Cancel replay events
        auto flush = [&criteria](const LoadStoreInstInfoPtr & ldst_info_ptr) -> bool
@@ -894,6 +932,89 @@ namespace olympia
        ILOG("Append new load/store instruction to issue queue!");
    }

+    void LSU::allocateInstToStoreBuffer_(const InstPtr & inst_ptr)
+    {
+        const auto & store_info_ptr = createLoadStoreInst_(inst_ptr);
+
+        sparta_assert(store_buffer_.size() < ldst_inst_queue_size_,
+                      "Appending to the store buffer would cause an overflow!");
+
+        store_buffer_.push_back(store_info_ptr);
+        ILOG("Store added to store buffer: " << inst_ptr);
+    }
+
+    bool LSU::tryStoreToLoadForwarding(const InstPtr & load_inst_ptr) const
+    {
+        const uint64_t load_addr = load_inst_ptr->getTargetVAddr();
+        const uint32_t load_size = load_inst_ptr->getMemAccessSize();
+
+        // A load must have a non-zero size to access memory.
+        if (load_size == 0)
+        {
+            return false;
+        }
+
+        std::vector<bool> coverage_mask(load_size, false);
+        uint32_t bytes_covered_count = 0;
+
+        // Iterate through the store_buffer_ from youngest to oldest.
+        // This ensures that if multiple stores write to the same byte,
+        // the data from the youngest store is effectively used.
+        for (auto it = store_buffer_.rbegin(); it != store_buffer_.rend(); ++it)
+        {
+            const auto & store_info_ptr = *it; // LoadStoreInstInfoPtr
+            const InstPtr & store_inst_ptr = store_info_ptr->getInstPtr();
+
+            const uint64_t store_addr = store_inst_ptr->getTargetVAddr();
+            const uint32_t store_size = store_inst_ptr->getMemAccessSize();
+
+            if (store_size == 0)
+            {
+                continue; // Skip stores that don't actually write data.
+            }
+
+            // Determine the overlapping region [overlap_start_addr, overlap_end_addr).
+            // The overlap is in terms of global memory addresses.
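+            // Worked example (illustrative values): load_addr = 0x1000 with
+            // load_size = 8 against store_addr = 0x1004 with store_size = 4
+            // yields the overlap [0x1004, 0x1008), i.e. bytes 4..7 of coverage_mask.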
+            uint64_t overlap_start_addr = std::max(load_addr, store_addr);
+            uint64_t overlap_end_addr = std::min(load_addr + load_size, store_addr + store_size);
+
+            // If there's an actual overlap (i.e., the range is not empty)
+            if (overlap_start_addr < overlap_end_addr)
+            {
+                // Iterate over the bytes *within the load's address range* that this store covers.
+                for (uint64_t current_byte_global_addr = overlap_start_addr;
+                     current_byte_global_addr < overlap_end_addr; ++current_byte_global_addr)
+                {
+                    // Calculate the index of this byte relative to the load's start address.
+                    // This index is used for the coverage_mask.
+                    uint32_t load_byte_idx = static_cast<uint32_t>(current_byte_global_addr - load_addr);
+
+                    // If this byte within the load's coverage_mask hasn't been marked true yet
+                    // (meaning it hasn't been covered by an even younger store), mark it.
+                    if (!coverage_mask[load_byte_idx])
+                    {
+                        coverage_mask[load_byte_idx] = true;
+                        bytes_covered_count++;
+                    }
+                }
+            }
+
+            // If all bytes of the load are now covered, no need to check even older stores
+            if (bytes_covered_count == load_size)
+            {
+                break;
+            }
+        }
+
+        // Check if all bytes required by the load were covered by stores in the buffer.
+        return bytes_covered_count == load_size;
+    }
+
+    LoadStoreInstInfoPtr LSU::getOldestStore_() const
+    {
+        if (store_buffer_.empty())
+        {
+            return nullptr;
+        }
+        return store_buffer_.read(0);
+    }
+
    bool LSU::allOlderStoresIssued_(const InstPtr & inst_ptr)
    {
        for (const auto & ldst_info_ptr : ldst_inst_queue_)
@@ -1368,4 +1489,20 @@ namespace olympia
        }
    }

+    void LSU::flushStoreBuffer_(const FlushCriteria & criteria)
+    {
+        auto sb_iter = store_buffer_.begin();
+        while (sb_iter != store_buffer_.end())
+        {
+            auto inst_ptr = (*sb_iter)->getInstPtr();
+            if (criteria.includedInFlush(inst_ptr))
+            {
+                // sparta::Buffer::erase() does not return an iterator, so advance before erasing
+                auto delete_iter = sb_iter++;
+                store_buffer_.erase(delete_iter);
+                ILOG("Flushed store from store buffer: " << inst_ptr);
+            }
+            else
+            {
+                ++sb_iter;
+            }
+        }
+    }
+
} // namespace olympia
diff --git a/core/lsu/LSU.hpp b/core/lsu/LSU.hpp
index 0896169c..ea20adef 100644
--- a/core/lsu/LSU.hpp
+++ b/core/lsu/LSU.hpp
@@ -50,10 +50,14 @@ namespace olympia
        PARAMETER(uint32_t, ldst_inst_queue_size, 8, "LSU ldst inst queue size")
        PARAMETER(uint32_t, replay_buffer_size, ldst_inst_queue_size, "Replay buffer size")
        PARAMETER(uint32_t, replay_issue_delay, 3, "Replay Issue delay")
+        // PARAMETER(uint32_t, store_buffer_size, ldst_inst_queue_size, "Size of the store buffer")
        // LSU microarchitecture parameters
        PARAMETER(
-            bool, allow_speculative_load_exec, true,
+            bool, allow_speculative_load_exec, false,
            "Allow loads to proceed speculatively before all older store addresses are known")
+        PARAMETER(
+            bool, allow_data_forwarding, true,
+            "Allow data forwarding to bypass the cache lookup / memory access")
        // Pipeline length
        PARAMETER(uint32_t, mmu_lookup_stage_length, 1, "Length of the mmu lookup stage")
        PARAMETER(uint32_t, cache_lookup_stage_length, 1, "Length of the cache lookup stage")
@@ -73,6 +77,11 @@ namespace olympia
        //! name of this resource.
        static const char name[];

+        // Return allow_data_forwarding (exposed for the test bench)
+        bool allowDataForwardingEX() const
+        {
+            return allow_data_forwarding_;
+        }
+
        ////////////////////////////////////////////////////////////////////////////////
        // Type Name/Alias Declaration
        ////////////////////////////////////////////////////////////////////////////////
@@ -137,6 +146,10 @@ namespace olympia
        const uint32_t replay_buffer_size_;
        const uint32_t replay_issue_delay_;

+        // Store Buffer
+        sparta::Buffer<LoadStoreInstInfoPtr> store_buffer_;
+        const uint32_t store_buffer_size_;
+
        sparta::PriorityQueue<LoadStoreInstInfoPtr> ready_queue_;
        // MMU unit
        bool mmu_busy_ = false;
@@ -169,6 +182,7 @@ namespace olympia

        // LSU Microarchitecture parameters
        const bool allow_speculative_load_exec_;
+        const bool allow_data_forwarding_;

        // ROB stopped simulation early, transactions could still be inflight.
        bool rob_stopped_simulation_ = false;
@@ -258,6 +272,15 @@ namespace olympia

        void allocateInstToIssueQueue_(const InstPtr & inst_ptr);

+        // Allocate a store to the store buffer
+        void allocateInstToStoreBuffer_(const InstPtr & inst_ptr);
+
+        // Check whether a load can be fully forwarded from the store buffer
+        bool tryStoreToLoadForwarding(const InstPtr & load_inst_ptr) const;
+
+        // Get the oldest store in the store buffer
+        LoadStoreInstInfoPtr getOldestStore_() const;
+
        bool olderStoresExists_(const InstPtr & inst_ptr);

        bool allOlderStoresIssued_(const InstPtr & inst_ptr);
@@ -315,6 +338,8 @@ namespace olympia
        // Flush Replay Buffer
        void flushReplayBuffer_(const FlushCriteria &);

+        void flushStoreBuffer_(const FlushCriteria &);
+
        // Counters
        sparta::Counter lsu_insts_dispatched_{getStatisticSet(), "lsu_insts_dispatched",
                                              "Number of LSU instructions dispatched",
diff --git a/docs/design_document_template/LSU.adoc b/docs/design_document_template/LSU.adoc
new file mode 100644
index 00000000..c277686b
--- /dev/null
+++ b/docs/design_document_template/LSU.adoc
@@ -0,0 +1,256 @@
+:doctitle: Olympia Load Store Unit (LSU) Design Document
+
+:toc:
+
+[[Document_Information]]
+== Document Information
+
+=== Revision History
+
+[width="100%",cols="11%,11%,16%,62%",options="header",]
+|===
+|*Revision* |*Date* |*Author* |*Summary of Changes*
+|0.1 | 2024.12.13 | Team | Initial LSU design document with multi-pipeline and data forwarding
+|===
+
+=== Conventions and Terminology
+
+[width="100%",cols="17%,83%",options="header",]
+|===
+|Label |Description
+|LSU |Load Store Unit - Handles all memory operations
+|MMU |Memory Management Unit - Handles virtual to physical address translation
+|ROB |ReOrder Buffer - Ensures in-order commitment of instructions
+|TLB |Translation Lookaside Buffer - Cache for virtual to physical address translations
+|RAW |Read After Write hazard - Load depends on earlier store
+|WAW |Write After Write hazard - Store depends on earlier store
+|CSB |Committed Store Buffer - Holds retired stores waiting to write to memory
+|===
+
+=== Related Documents
+
+[width="100%",cols="25%,75%",options="header",]
+|===
+|*Title* |*Description*
+|The RISC-V Instruction Set Manual Volume I |Unprivileged Architecture Version 2024041
+|Olympia Core Architecture |Core architecture specification
+|Core Memory Model |Memory subsystem specification
+|===
+
+=== Notes/Open Issues
+
+* Optimization of store buffer search for data forwarding
+* Handling of cache bank conflicts with multiple pipelines
+* Fine-tuning of pipeline stage lengths for optimal performance
+
+== OVERVIEW
+
+The Load Store Unit (LSU) implements the memory interface for the Olympia processor, managing all load and store operations.
+The LSU features multiple parallel pipelines and data forwarding, and it ensures memory consistency while maintaining high performance through careful hazard management and efficient queueing structures.
+
+=== Overview Block Diagram
+
+image::./media/LSU.png[LSU Block Diagram]
+
+Figure 1 - LSU Block Diagram
+
+== Functional Description
+
+=== Unit Block Description
+
+The LSU consists of several key functional blocks:
+
+1. *Instruction Queues*
+   - Load/Store Instruction Queue (ldst_inst_queue_)
+   - Store Buffer (store_buffer_)
+   - Ready Queue (ready_queue_)
+
+2. *Pipeline Units*
+   - Multiple parallel Load/Store pipelines
+   - Address Generation Units
+   - Data Forwarding Logic
+
+3. *Interface Controllers*
+   - MMU Interface
+   - Cache Interface
+   - ROB Interface
+
+=== Key Components Detail
+
+==== Load/Store Instruction Queue (ldst_inst_queue_)
+* Size: Configurable through the ldst_inst_queue_size parameter
+* Purpose: Holds instructions from dispatch until ready for execution
+* Implementation: `sparta::Buffer<LoadStoreInstInfoPtr>`
+* Key Methods:
+[source,cpp]
+----
+void allocateInstToIssueQueue_(const InstPtr & inst_ptr)
+void popIssueQueue_(const LoadStoreInstInfoPtr & inst_ptr)
+----
+
+==== Store Buffer
+* Size: Matches ldst_inst_queue_size_
+* Purpose:
+  - Maintains program order for stores
+  - Enables store-to-load forwarding
+  - Tracks uncommitted stores
+* Implementation:
+[source,cpp]
+----
+sparta::Buffer<LoadStoreInstInfoPtr> store_buffer_;
+LoadStoreInstInfoPtr findYoungestMatchingStore_(const uint64_t addr) const;
+----
+
+==== Pipeline Stages
+
+[width="100%",cols="20%,15%,65%",options="header",]
+|===
+|Stage |Cycles |Function
+|Address Calculation |1 |Virtual address generation
+|MMU Lookup |1-N |Address translation
+|Cache Lookup |1-N |Cache access initiation
+|Cache Read |1 |Data retrieval
+|Complete |1 |Instruction completion
+|===
+
+=== Operation Flow
+
+1. *Instruction Receipt*
+   - Receives instructions from dispatch
+   - Allocates queue entries
+   - Begins tracking dependencies
+
+2. *Issue Stage*
+   - Checks operand readiness
+   - Verifies no hazards exist
+   - Selects ready instructions for execution
+
+3. *Execution*
+   - Address calculation
+   - MMU interaction
+   - Cache access
+   - Data forwarding when applicable
+
+4. *Completion*
+   - Updates architectural state
+   - Handles exceptions
+   - Signals ROB for retirement
+
+=== Data Forwarding Implementation
+
+The data forwarding logic is implemented through the store buffer and involves:
+
+1. *Store Buffer Search*
+[source,cpp]
+----
+LoadStoreInstInfoPtr findYoungestMatchingStore_(const uint64_t addr) const {
+    auto it = std::find_if(store_buffer_.rbegin(), store_buffer_.rend(),
+                           [addr](const auto & store) {
+                               return store->getInstPtr()->getTargetVAddr() == addr;
+                           });
+    return (it != store_buffer_.rend()) ? *it : nullptr;
+}
+----
+This initial search method identifies the youngest store that matches a specific address. However, for comprehensive forwarding, especially when a load might be covered by multiple stores or a store might only partially cover a load, a more detailed check is required.
+
+2. *Forward Detection*
+[source,cpp]
+----
+void handleCacheLookupReq_() {
+    // ...
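+    // Forwarding is only attempted for loads, and only when the
+    // allow_data_forwarding parameter is enabled; stores were already
+    // allocated to the store buffer at dispatch.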
+    if (!inst_ptr->isStoreInst() && allow_data_forwarding_) {
+        const uint64_t load_addr = inst_ptr->getTargetVAddr();
+        // The original findYoungestMatchingStore_ might be a quick initial check:
+        // auto forwarding_store = findYoungestMatchingStore_(load_addr);
+        // A more comprehensive check like tryStoreToLoadForwarding is needed for full/partial coverage:
+        if (tryStoreToLoadForwarding(inst_ptr)) { // <1>
+            mem_access_info_ptr->setDataReady(true);
+            mem_access_info_ptr->setCacheState(MemoryAccessInfo::CacheState::HIT);
+            return;
+        }
+    }
+    // ...
+}
+----
+<1> `tryStoreToLoadForwarding` is a more detailed function that checks for full data coverage.
+
+[[Comprehensive_Forwarding_Check]]
+==== Comprehensive Forwarding Check (tryStoreToLoadForwarding)
+
+To determine if a load instruction can receive all its data from the `store_buffer_`, a more thorough check is performed. This mechanism is crucial for performance, as it can prevent stalls by bypassing cache access when the data is available more locally.
+
+The process involves the following steps:
+
+1. *Initialization*:
+   * The load's target virtual address (`load_addr`) and its size (`load_size`) are obtained.
+   * A `coverage_mask` (typically a `std::vector<bool>`, or a bitmask like `uint64_t` for smaller loads) is created with a size equal to `load_size`. Each element/bit corresponds to a byte of the load, initialized to indicate "not covered."
+   * A counter, `bytes_covered_count`, is initialized to zero.
+
+2. *Store Buffer Iteration*:
+   * The `store_buffer_` is iterated from the youngest (most recently added) store to the oldest. This order is critical to ensure that if multiple stores write to the same byte, the data from the youngest store is considered for forwarding.
+   * For each store in the buffer:
+   * The store's virtual address (`store_addr`) and size (`store_size`) are retrieved.
+   * The overlapping memory region between the current load and the store is calculated:
+   * `overlap_start_addr = std::max(load_addr, store_addr)`
+   * `overlap_end_addr = std::min(load_addr + load_size, store_addr + store_size)`
+   * If `overlap_start_addr < overlap_end_addr`, an actual overlap exists.
+
+3. *Updating Coverage*:
+   * For each byte within the calculated overlapping region:
+   * The byte's 0-based index relative to the `load_addr` (`load_byte_idx`) is determined.
+   * If the `coverage_mask` indicates that this `load_byte_idx` has *not* yet been covered by an even younger store (i.e., `!coverage_mask[load_byte_idx]`):
+   * The `coverage_mask` for `load_byte_idx` is marked as "covered" (e.g., set to `true`).
+   * `bytes_covered_count` is incremented.
+   * This ensures each byte of the load is counted only once towards being covered, and by the youngest store that provides it.
+
+4. *Early Exit Optimization*:
+   * If, after processing any store, `bytes_covered_count` becomes equal to `load_size`, all bytes of the load are covered. The iteration through the `store_buffer_` can stop early, as no further stores need to be checked.
+
+5. *Final Determination*:
+   * After iterating through the necessary stores, if `bytes_covered_count == load_size`, the load can be fully forwarded.
+   * In this case, the load's `MemoryAccessInfoPtr` is updated: `setDataReady(true)` and `setCacheState(MemoryAccessInfo::CacheState::HIT)`. The LSU can then proceed as if the data were instantaneously available, bypassing a full cache lookup for the data itself.
+   * If `bytes_covered_count < load_size`, the load cannot be fully forwarded from the store buffer and must proceed to the cache for the remaining (or all) data.
+
+This comprehensive check allows the LSU to forward data even when it is sourced from multiple (potentially partial) stores, significantly improving performance for common RAW (Read-After-Write) hazard scenarios.
+
+=== Multi-Pipeline Design
+
+The LSU implements multiple parallel pipelines through:
+
+1. *Pipeline Configuration*
+[source,cpp]
+----
+PARAMETER(uint32_t, num_pipelines, 2, "Number of load/store pipelines")
+std::vector<LoadStorePipeline> ldst_pipelines_;
+----
+
+2. *Pipeline Management*
+- Round-robin allocation
+- Independent progress tracking
+- Shared resource arbitration
+
+== Test Bench Description
+
+=== Basic Functionality Tests
+* Load/Store instruction handling
+* Address translation
+* Data forwarding correctness
+* Pipeline utilization
+
+=== Corner Cases
+* Pipeline stalls
+* Exception handling
+* Flush scenarios
+* Resource conflicts
+
+== Future Work
+
+1. Enhanced store buffer search algorithms
+2. Advanced pipeline scheduling
+3. Improved hazard detection
+4. Extended performance counters
+
+== References
+
+[1] RISC-V Specification
+[2] Olympia Core Architecture Document
+[3] Memory Consistency Model Specification
\ No newline at end of file
diff --git a/docs/design_document_template/media/LSU.png b/docs/design_document_template/media/LSU.png
new file mode 100644
index 00000000..c5e66f2f
Binary files /dev/null and b/docs/design_document_template/media/LSU.png differ
diff --git a/docs/lsu.md b/docs/lsu.md
index e0b1a95b..d0e67bb1 100644
--- a/docs/lsu.md
+++ b/docs/lsu.md
@@ -24,6 +24,8 @@ out_mmu_lookup_req --> Output to DCache (Send a VA to PA address translation r

`allow_speculative_load_exec` - Allow loads to proceed speculatively before all older store addresses are known.

+`allow_data_forwarding` - Allow loads to get their data forwarded from an older store instead of reading the cache, bypassing the memory lookup (see the example below).
+
`replay_buffer_size` - Size of the replay buffer. Defaults to the same size of the LSU instruction queue.

`replay_issue_delay` - Delay in cycles to replay the instruction.
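For example, the new parameter can be overridden from the simulator command line like the other LSU parameters (illustrative invocation; `-p` is sparta's standard parameter-override flag, and the workload path is a placeholder):

    ./olympia traces/example.json -p top.cpu.core0.lsu.params.allow_data_forwarding true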
diff --git a/test/core/lsu/Lsu_test.cpp b/test/core/lsu/Lsu_test.cpp
index 50e86c15..8ad1bebd 100644
--- a/test/core/lsu/Lsu_test.cpp
+++ b/test/core/lsu/Lsu_test.cpp
@@ -51,6 +51,7 @@ class olympia::LSUTester
        EXPECT_EQUAL(lsu.cache_read_stage_, 4);
        EXPECT_EQUAL(lsu.complete_stage_, 6);
    }
+
};

const char USAGE[] =
@@ -107,11 +108,50 @@ void runTest(int argc, char **argv)
    olympia::LSU *my_lsu = root_node->getChild("cpu.core0.lsu")->getResourceAs<olympia::LSU>();
    olympia::LSUTester lsupipe_tester;
    lsupipe_tester.test_pipeline_stages(*my_lsu);
-    cls.runSimulator(&sim, 9);
-    lsupipe_tester.test_inst_issue(*my_lsu, 2); // Loads operand dependency meet
-    cls.runSimulator(&sim, 52);
-    lsupipe_tester.test_replay_issue_abort(*my_lsu, 3); // Loads operand dependency meet
-    cls.runSimulator(&sim);
+
+    if (my_lsu->allowDataForwardingEX()) {
+        // Data forwarding enabled case
+        std::cout << "allow data forwarding\n";
+        // First store
+        cls.runSimulator(&sim, 7);
+
+        // First load - should get data from store forwarding
+        auto start_cycle = my_lsu->getClock()->currentCycle();
+        cls.runSimulator(&sim, 3);
+        EXPECT_EQUAL(my_lsu->getClock()->currentCycle() - start_cycle, 3); // Fast path
+
+        // Second load - no matching store, goes to cache
+        start_cycle = my_lsu->getClock()->currentCycle();
+        cls.runSimulator(&sim, 7);
+        EXPECT_EQUAL(my_lsu->getClock()->currentCycle() - start_cycle, 7); // Cache access path
+
+        // Second store and load
+        cls.runSimulator(&sim, 47);
+        lsupipe_tester.test_replay_issue_abort(*my_lsu, 0);
+    }
+    else {
+        // Data forwarding disabled case
+
+        // First store
+        cls.runSimulator(&sim, 7);
+
+        // First load - must go to cache
+        auto start_cycle = my_lsu->getClock()->currentCycle();
+        cls.runSimulator(&sim, 7); // Takes longer, must access cache
+        EXPECT_EQUAL(my_lsu->getClock()->currentCycle() - start_cycle, 7);
+
+        // Second load - also must go to cache
+        start_cycle = my_lsu->getClock()->currentCycle();
+        cls.runSimulator(&sim, 7);
+        EXPECT_EQUAL(my_lsu->getClock()->currentCycle() - start_cycle, 7);
+
+        // Second store and load
+        cls.runSimulator(&sim, 47);
+        lsupipe_tester.test_replay_issue_abort(*my_lsu, 2);
+    }
+
+    // Final state
+    cls.runSimulator(&sim);
}

int main(int argc, char **argv)
diff --git a/test/core/lsu/raw.json b/test/core/lsu/raw.json
index 63242dac..5451707b 100644
--- a/test/core/lsu/raw.json
+++ b/test/core/lsu/raw.json
@@ -1,6 +1,6 @@
[
    {
-        "mnemonic": "lw",
+        "mnemonic": "sw",
        "rs1": 5,
        "rs2": 6,
        "vaddr": "0xdeeebeef"
@@ -15,7 +15,7 @@
        "mnemonic": "lw",
        "rs1": 5,
        "rs2": 6,
-        "vaddr": "0xdeeebeef"
+        "vaddr": "0xdeebbeef"
    },
    {
        "mnemonic": "sw",
@@ -33,6 +33,6 @@
        "mnemonic": "lw",
        "rs1": 3,
        "rd": 4,
-        "vaddr" : "0xdeadbeef"
+        "vaddr" : "0xdeeebeef"
    }
]
\ No newline at end of file
diff --git a/test/sim/CMakeLists.txt b/test/sim/CMakeLists.txt
index ee44c64d..db87e070 100644
--- a/test/sim/CMakeLists.txt
+++ b/test/sim/CMakeLists.txt
@@ -91,6 +91,8 @@ list(APPEND test_params_list "top.cpu.core0.lsu.params.cache_lookup_stage_length
list(APPEND test_params_list "top.cpu.core0.lsu.params.cache_read_stage_length 3")
list(APPEND test_params_list "top.cpu.core0.lsu.params.allow_speculative_load_exec false")
list(APPEND test_params_list "top.cpu.core0.lsu.params.allow_speculative_load_exec true")
+list(APPEND test_params_list "top.cpu.core0.lsu.params.allow_data_forwarding false")
+list(APPEND test_params_list "top.cpu.core0.lsu.params.allow_data_forwarding true")
list(APPEND test_params_list "top.cpu.core0.lsu.params.replay_issue_delay 5")

# Used to set a custom name for each test
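
For reference, below is a minimal standalone sketch of the byte-coverage forwarding check described in the design document above, using plain STL types in place of the olympia/sparta classes (`StoreRecord`, `canForward`, and the addresses in `main` are illustrative stand-ins, not part of the patch):

[source,cpp]
----
#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative stand-in for a store buffer entry (not the olympia type).
struct StoreRecord
{
    uint64_t addr;
    uint32_t size;
};

// Returns true when every byte of [load_addr, load_addr + load_size) is
// covered by at least one store; `stores` is ordered oldest -> youngest,
// mirroring the store_buffer_ iteration in the patch.
bool canForward(const std::vector<StoreRecord> & stores,
                uint64_t load_addr, uint32_t load_size)
{
    if (load_size == 0) { return false; }

    std::vector<bool> coverage_mask(load_size, false);
    uint32_t bytes_covered = 0;

    // Youngest first, so the newest data wins on overlapping bytes.
    for (auto it = stores.rbegin(); it != stores.rend(); ++it)
    {
        const uint64_t overlap_begin = std::max(load_addr, it->addr);
        const uint64_t overlap_end =
            std::min(load_addr + load_size, it->addr + it->size);

        for (uint64_t a = overlap_begin; a < overlap_end; ++a)
        {
            const auto idx = static_cast<uint32_t>(a - load_addr);
            if (!coverage_mask[idx])
            {
                coverage_mask[idx] = true;
                ++bytes_covered;
            }
        }
        if (bytes_covered == load_size) { break; } // early exit
    }
    return bytes_covered == load_size;
}

int main()
{
    // Two partial 4-byte stores together cover an 8-byte load at 0x1000.
    const std::vector<StoreRecord> stores = {{0x1000, 4}, {0x1004, 4}};
    return canForward(stores, 0x1000, 8) ? 0 : 1;
}
----

Compiled as-is, this returns 0 because the two 4-byte stores jointly cover the 8-byte load, exercising both the youngest-first iteration and the early-exit rule of `tryStoreToLoadForwarding`.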