from amaranth import *
from amaranth.lib.data import View
import amaranth.lib.memory as memory
from amaranth.utils import exact_log2

from transactron.core import Priority, TModule
from transactron import Method, def_method, Transaction
from coreblocks.params import DCacheParameters
from transactron.utils import assign
from transactron.lib import *
from transactron.lib import logging
from transactron.lib.simultaneous import condition

from coreblocks.cache.iface import CacheInterface, DataCacheRefillerInterface
from coreblocks.peripherals.bus_adapter import BusMasterInterface
from transactron.utils.transactron_helpers import make_layout

__all__ = [
    "DCache",
    "DCacheBypass",
]

from coreblocks.interface.layouts import DCacheLayouts

log = logging.HardwareLogger("backend.dcache")


class DCacheBypass(Elaboratable, CacheInterface):
    """Uncached data path: forwards every request directly to the bus.

    Used when the D-cache is disabled and as the MMIO path. Requests
    complete in order; a FIFO of request kinds matches each response to
    the bus channel (read/write) it must be collected from.
    """

    def __init__(self, layouts: DCacheLayouts, params: DCacheParameters, bus_master: BusMasterInterface) -> None:
        self.layouts = layouts
        self.params = params
        self.bus_master = bus_master

        self.issue_req = Method(i=layouts.issue_req)
        self.accept_res = Method(o=layouts.accept_res)
        self.flush = Method()

        if params.word_width != bus_master.params.data_width:
            raise ValueError("Data cache bypass word width must match bus data width.")
        if bus_master.params.granularity != 8:
            raise ValueError("Data cache bypass expects byte-granular bus selects.")

    def elaborate(self, platform):
        m = TModule()

        # Remembers, per in-flight request, whether it was a store, so that
        # accept_res waits on the matching bus response channel.
        m.submodules.store_fifo = store_fifo = BasicFifo([("store", 1)], self.params.request_depth)

        @def_method(m, self.issue_req)
        def _(addr: Value, data: Value, byte_mask: Value, store: Value):
            # The bus is word-addressed; drop the byte offset bits.
            bus_addr = addr >> exact_log2(self.params.word_width_bytes)

            with condition(m) as branch:
                with branch(store):
                    self.bus_master.request_write(m, addr=bus_addr, data=data, sel=byte_mask)
                with branch():
                    self.bus_master.request_read(m, addr=bus_addr, sel=byte_mask)

            store_fifo.write(m, store=store)

        @def_method(m, self.accept_res)
        def _():
            request = store_fifo.read(m)
            data = Signal(self.params.word_width)
            error = Signal()

            with condition(m) as branch:
                with branch(request.store):
                    res = self.bus_master.get_write_response(m)
                    m.d.comb += error.eq(res.err)
                with branch():
                    res = self.bus_master.get_read_response(m)
                    m.d.comb += [
                        data.eq(res.data),
                        error.eq(res.err),
                    ]

            return {"data": data, "error": error}

        @def_method(m, self.flush)
        def _() -> None:
            # Nothing is cached on this path — flush is a no-op.
            pass

        return m


class DCache(Elaboratable, CacheInterface):
    """A simple write-back data cache

    Single-core design — no coherence protocol.

    Replacement policy: pseudo-random round-robin (same as ICache).
    """

    def __init__(
        self, layouts: DCacheLayouts, params: DCacheParameters, refiller: DataCacheRefillerInterface
    ) -> None:
        """
        Parameters
        ----------
        layouts : DCacheLayouts
            Layouts for D-cache methods.
        params : DCacheParameters
            Cache geometry and configuration.
        refiller : DataCacheRefillerInterface
            Refiller with writeback support for dirty line eviction.
        """
        self.layouts = layouts
        self.params = params
        self.refiller = refiller

        # Methods
        self.issue_req = Method(i=layouts.issue_req)
        self.accept_res = Method(o=layouts.accept_res)
        self.issue_req.add_conflict(self.accept_res, Priority.LEFT)

        self.flush = Method()
        self.flush.add_conflict(self.issue_req, Priority.LEFT)

        # Method called by refiller AFTER cache started writeback (called start_writeback method in refiller)
        self.provide_writeback_data = Method(o=layouts.provide_writeback_data)

        self.addr_layout = make_layout(
            ("offset", self.params.offset_bits),
            ("index", self.params.index_bits),
            ("tag", self.params.tag_bits),
        )

        self.perf_loads = HwCounter("backend.dcache.loads")
        self.perf_stores = HwCounter("backend.dcache.stores")
        self.perf_hits = HwCounter("backend.dcache.hits")
        self.perf_misses = HwCounter("backend.dcache.misses")
        self.perf_writebacks = HwCounter("backend.dcache.writebacks")
        self.perf_errors = HwCounter("backend.dcache.errors")
        self.perf_flushes = HwCounter("backend.dcache.flushes")

    def deserialize_addr(self, raw_addr: Value) -> dict[str, Value]:
        """Split a flat byte address into offset/index/tag fields."""
        return {
            "offset": raw_addr[: self.params.offset_bits],
            "index": raw_addr[self.params.index_start_bit : self.params.index_end_bit + 1],
            "tag": raw_addr[-self.params.tag_bits :],
        }

    def serialize_addr(self, addr: View) -> Value:
        """Inverse of deserialize_addr: rebuild the flat byte address."""
        return Cat(addr.offset, addr.index, addr.tag)

    def elaborate(self, platform):
        m = TModule()

        m.submodules += [
            self.perf_loads,
            self.perf_stores,
            self.perf_hits,
            self.perf_misses,
            self.perf_writebacks,
            self.perf_errors,
            self.perf_flushes,
        ]

        m.submodules.mem = self.mem = DCacheMemory(self.params)
        m.submodules.req_fifo = req_fifo = BasicFifo(self.layouts.issue_req, self.params.request_depth)
        m.submodules.res_fifo = res_fifo = BasicFifo(self.layouts.accept_res, self.params.request_depth)

        rr_way = Signal(range(self.params.num_of_ways))  # Round-robin state
        rr_used = Signal()

        outstanding = Signal(range(self.params.request_depth + 1))

        flush_start = Signal()
        flush_finish = Signal()
        needs_writeback = Signal()  # if we missed and victim is dirty, writeback the victim and load new line
        needs_refill = Signal()
        refill_finish = Signal()
        writeback_done_error = Signal()
        writeback_done_flush = Signal()
        writeback_done_refill = Signal()

        wb_way = Signal(range(self.params.num_of_ways))
        wb_index = Signal(self.params.index_bits)
        wb_word_counter = Signal(range(self.params.words_in_line))

        wb_triggered_by_flush = Signal()
        flush_writeback_pending = Signal()

        # Request/response communication (lookup)
        pending_req = Signal(self.layouts.issue_req)
        pending_req_valid = Signal()
        lookup_addr = Signal(self.addr_layout)
        lookup_valid = Signal()

        # Refill state
        refill_addr = Signal(self.addr_layout)
        refill_way = Signal(range(self.params.num_of_ways))
        refill_error = Signal()

        # Starts in FLUSH so that all tag entries are invalidated after reset.
        with m.FSM(init="FLUSH") as fsm:
            with m.State("FLUSH"):
                with m.If(flush_writeback_pending):
                    m.next = "WRITEBACK"
                with m.Elif(flush_finish):
                    m.next = "LOOKUP"

            with m.State("LOOKUP"):
                with m.If(flush_start):
                    m.next = "FLUSH"
                with m.Elif(needs_writeback):
                    m.next = "WRITEBACK"
                with m.Elif(needs_refill):
                    m.next = "REFILL"

            with m.State("REFILL"):
                with m.If(refill_finish):
                    m.next = "LOOKUP"

            with m.State("WRITEBACK"):
                with m.If(writeback_done_error):
                    with m.If(wb_triggered_by_flush):
                        m.d.sync += [
                            wb_triggered_by_flush.eq(0),
                            flush_writeback_pending.eq(0),
                        ]
                    # On a writeback bus error the flush/refill is abandoned.
                    m.next = "LOOKUP"
                with m.Elif(writeback_done_flush):
                    m.d.sync += [
                        wb_triggered_by_flush.eq(0),
                        flush_writeback_pending.eq(0),
                    ]
                    m.next = "FLUSH"
                with m.Elif(writeback_done_refill):
                    m.next = "REFILL"

        # One-cycle wait for the tag/data SRAM read of the pending request.
        with m.If(fsm.ongoing("LOOKUP") & pending_req_valid & ~lookup_valid):
            m.d.sync += lookup_valid.eq(1)

        with Transaction(name="StartLookupFromQueue").body(
            m, ready=fsm.ongoing("LOOKUP") & ~pending_req_valid & ~lookup_valid
        ):
            req = req_fifo.read(m)
            deserialized = self.deserialize_addr(req.addr)
            m.d.sync += [
                pending_req.addr.eq(req.addr),
                pending_req.data.eq(req.data),
                pending_req.byte_mask.eq(req.byte_mask),
                pending_req.store.eq(req.store),
                pending_req_valid.eq(1),
                assign(lookup_addr, deserialized),
                lookup_valid.eq(0),
            ]

        # ------------- FLUSH -------
        # Iterates through all sets, checks dirty bits, starts writeback if needed, then invalidates.
        # 2 cycles per set: cycle 1 = SRAM read (wait), cycle 2 = process tag_rd_data.
        # TODO: optimize to 1 cycle per set

        flush_index = Signal(self.params.index_bits)
        flush_data_valid = Signal()  # high when tag_rd_data is valid for current flush_index

        # SRAM read latency: wait 1 cycle for data
        with m.If(fsm.ongoing("FLUSH") & ~flush_data_valid):
            m.d.sync += flush_data_valid.eq(1)

        with m.If(~fsm.ongoing("FLUSH")):
            m.d.sync += flush_data_valid.eq(0)

        # NOTE: tag_rd_index is driven from flush_index by the consolidated
        # memory-connection mux at the bottom of elaborate (single driver).

        @def_method(
            m,
            self.flush,
            ready=fsm.ongoing("LOOKUP") & (outstanding == 0) & ~pending_req_valid & ~lookup_valid,
        )
        def _():
            log.info(m, True, "Flushing the cache...")
            self.perf_flushes.incr(m)
            m.d.sync += flush_index.eq(0)
            m.d.sync += flush_data_valid.eq(0)
            m.d.comb += flush_start.eq(1)

        with Transaction(name="Flush").body(m, ready=fsm.ongoing("FLUSH") & flush_data_valid):
            # tag_rd_data is valid for current flush_index

            any_dirty = Signal()
            dirty_way = Signal(range(self.params.num_of_ways))

            for i in range(self.params.num_of_ways):
                tag_data = self.mem.tag_rd_data[i]
                with m.If(tag_data.valid & tag_data.dirty):
                    m.d.comb += any_dirty.eq(1)
                    m.d.comb += dirty_way.eq(i)

            with m.If(any_dirty):
                # Writeback the dirty way, then come back to re-check this set
                wb_addr = Signal(self.addr_layout)
                m.d.comb += [
                    wb_addr.offset.eq(0),
                    wb_addr.index.eq(flush_index),
                    wb_addr.tag.eq(self.mem.tag_rd_data[dirty_way].tag),
                ]
                self.perf_writebacks.incr(m)
                self.refiller.start_writeback(m, addr=self.serialize_addr(wb_addr))
                m.d.sync += [
                    wb_way.eq(dirty_way),
                    wb_index.eq(flush_index),
                    wb_word_counter.eq(0),
                    wb_triggered_by_flush.eq(1),
                    flush_writeback_pending.eq(1),
                ]
            with m.Else():
                # No dirty ways — invalidate all ways at this set
                m.d.comb += [
                    self.mem.way_wr_en.eq(C(1).replicate(self.params.num_of_ways)),
                    self.mem.tag_wr_index.eq(flush_index),
                    self.mem.tag_wr_data.valid.eq(0),
                    self.mem.tag_wr_data.dirty.eq(0),
                    self.mem.tag_wr_data.tag.eq(0),
                    self.mem.tag_wr_en.eq(1),
                ]

                # Only advance the index once the set is fully clean; a dirty
                # set is re-checked after its writeback completes.
                with m.If(flush_index == self.params.num_of_sets - 1):
                    m.d.comb += flush_finish.eq(1)
                with m.Else():
                    m.d.sync += flush_index.eq(flush_index + 1)
                m.d.sync += flush_data_valid.eq(0)

        # ------------------ WRITEBACK ---
        # data_rd_addr is driven from wb_index/wb_word_counter by the
        # consolidated memory-connection mux at the bottom of elaborate.

        @def_method(m, self.provide_writeback_data, ready=fsm.ongoing("WRITEBACK"))
        def _():
            # NOTE(review): relies on the refiller alternating REQ/RESP states,
            # which leaves >=1 cycle of SRAM read latency between beats — confirm.
            m.d.sync += wb_word_counter.eq(wb_word_counter + 1)
            return {"data": self.mem.data_rd_data[wb_way]}

        # End the writeback
        # Runs if FSM is WRITEBACK and refiller.accept_writeback is ready
        with Transaction(name="WritebackEnd").body(m, ready=fsm.ongoing("WRITEBACK")):
            result = self.refiller.accept_writeback(m)
            self.perf_errors.incr(m, enable_call=result.error)

            with m.If(~result.error):
                # After a successful writeback, the victim line can be invalidated.
                m.d.comb += [
                    self.mem.way_wr_en.eq(1 << wb_way),
                    self.mem.tag_wr_index.eq(wb_index),
                    self.mem.tag_wr_data.valid.eq(0),
                    self.mem.tag_wr_data.dirty.eq(0),
                    self.mem.tag_wr_data.tag.eq(0),
                    self.mem.tag_wr_en.eq(1),
                ]

                with m.If(wb_triggered_by_flush):
                    m.d.comb += writeback_done_flush.eq(1)
                with m.Else():
                    self.refiller.start_refill(m, addr=self.serialize_addr(refill_addr))
                    m.d.comb += writeback_done_refill.eq(1)

            with m.Else():
                m.d.comb += writeback_done_error.eq(1)
                with m.If(~wb_triggered_by_flush):
                    # The miss that caused this writeback gets an error response.
                    res_fifo.write(m, data=0, error=1)
                    m.d.sync += [
                        pending_req_valid.eq(0),
                        lookup_valid.eq(0),
                        rr_used.eq(0),
                    ]

        # Writeback is started either by lookup or flush

        # ---------- LOOKUP ----

        with Transaction(name="Lookup").body(m, ready=fsm.ongoing("LOOKUP") & pending_req_valid & lookup_valid):
            # If cache hit: set dirty bit if store -> return data
            # If cache miss: if victim has dirty bit, writeback cache line -> refill -> return data
            tag_hit = Array(
                self.mem.tag_rd_data[i].valid & (self.mem.tag_rd_data[i].tag == lookup_addr.tag)
                for i in range(self.params.num_of_ways)
            )

            tag_hit_any = Signal()
            m.d.comb += tag_hit_any.eq(Cat(tag_hit).any())

            hit_way = Signal(range(self.params.num_of_ways))
            load_data = Signal(self.params.word_width)

            # TODO: optimize that
            for i in range(self.params.num_of_ways):
                with m.If(tag_hit[i]):
                    m.d.comb += [
                        hit_way.eq(i),
                        load_data.eq(self.mem.data_rd_data[i]),
                    ]

            with m.If(tag_hit_any):
                self.perf_hits.incr(m)
                response_data = Signal(self.params.word_width)
                m.d.comb += response_data.eq(load_data)
                with m.If(pending_req.store):
                    m.d.comb += [
                        self.mem.way_wr_en.eq(1 << hit_way),
                        self.mem.data_wr_en.eq(1),
                        self.mem.data_wr_addr.index.eq(lookup_addr.index),
                        self.mem.data_wr_addr.offset.eq(lookup_addr.offset),
                        self.mem.data_wr_data.eq(pending_req.data),
                        self.mem.data_wr_mask.eq(pending_req.byte_mask),
                        self.mem.tag_wr_index.eq(lookup_addr.index),
                        self.mem.tag_wr_data.valid.eq(1),
                        self.mem.tag_wr_data.dirty.eq(1),
                        self.mem.tag_wr_data.tag.eq(lookup_addr.tag),
                        self.mem.tag_wr_en.eq(1),
                    ]
                    m.d.comb += response_data.eq(0)

                res_fifo.write(m, data=response_data, error=0)

                m.d.sync += [
                    pending_req_valid.eq(0),
                    lookup_valid.eq(0),
                ]

            with m.Else():
                self.perf_misses.incr(m)
                # we check if dirty, if yes, change FSM to writeback
                # then, we refill
                # then, lookup transaction starts again, now with proper refilled cache line

                # 1. Choose way determined by round-robin
                victim_way = Signal(range(self.params.num_of_ways))
                victim_used_rr = Signal()
                m.d.comb += [victim_way.eq(rr_way), victim_used_rr.eq(1)]

                # 2. If there are already some invalid ways, use them instead of round-robin
                for i in reversed(range(self.params.num_of_ways)):
                    with m.If(~self.mem.tag_rd_data[i].valid):
                        m.d.comb += [victim_way.eq(i), victim_used_rr.eq(0)]

                m.d.sync += [rr_used.eq(victim_used_rr)]

                victim_tag_data = self.mem.tag_rd_data[victim_way]
                victim_addr = Signal(self.addr_layout)
                m.d.comb += [
                    victim_addr.offset.eq(0),
                    victim_addr.index.eq(lookup_addr.index),
                    victim_addr.tag.eq(victim_tag_data.tag),
                ]

                aligned_refill_addr = self.serialize_addr(lookup_addr) & ~((1 << self.params.offset_bits) - 1)

                with m.If(victim_tag_data.valid & victim_tag_data.dirty):
                    # Writeback, then Refill
                    self.perf_writebacks.incr(m)
                    self.refiller.start_writeback(m, addr=self.serialize_addr(victim_addr))
                    m.d.comb += needs_writeback.eq(1)
                    m.d.sync += [
                        wb_way.eq(victim_way),
                        wb_index.eq(lookup_addr.index),
                        wb_word_counter.eq(0),
                        wb_triggered_by_flush.eq(0),
                        refill_addr.offset.eq(0),
                        refill_addr.index.eq(lookup_addr.index),
                        refill_addr.tag.eq(lookup_addr.tag),
                        refill_way.eq(victim_way),
                        refill_error.eq(0),
                        lookup_valid.eq(0),
                    ]
                with m.Else():
                    # Refill
                    self.refiller.start_refill(m, addr=aligned_refill_addr)
                    m.d.comb += needs_refill.eq(1)
                    m.d.sync += [
                        refill_addr.offset.eq(0),
                        refill_addr.index.eq(lookup_addr.index),
                        refill_addr.tag.eq(lookup_addr.tag),
                        refill_way.eq(victim_way),
                        refill_error.eq(0),
                        lookup_valid.eq(0),
                    ]

        # ------------- REFILL ---------
        with Transaction(name="Refill").body(m, ready=fsm.ongoing("REFILL")):
            ret = self.refiller.accept_refill(m)
            deserialized = self.deserialize_addr(ret.addr)
            refill_error_now = Signal()
            m.d.comb += refill_error_now.eq(refill_error | ret.error)

            with m.If(~ret.error):
                m.d.comb += [
                    self.mem.way_wr_en.eq(1 << refill_way),
                    self.mem.data_wr_en.eq(1),
                    self.mem.data_wr_addr.index.eq(deserialized["index"]),
                    self.mem.data_wr_addr.offset.eq(deserialized["offset"]),
                    self.mem.data_wr_data.eq(ret.data),
                    self.mem.data_wr_mask.eq((1 << self.params.word_width_bytes) - 1),
                ]

            with m.If(ret.error):
                m.d.sync += refill_error.eq(1)

            with m.If(ret.last):
                m.d.comb += refill_finish.eq(1)
                with m.If(~refill_error_now):
                    # Whole line landed without error — publish the new tag.
                    m.d.comb += [
                        self.mem.way_wr_en.eq(1 << refill_way),
                        self.mem.tag_wr_index.eq(refill_addr.index),
                        self.mem.tag_wr_data.valid.eq(1),
                        self.mem.tag_wr_data.dirty.eq(0),
                        self.mem.tag_wr_data.tag.eq(refill_addr.tag),
                        self.mem.tag_wr_en.eq(1),
                    ]
                    m.d.sync += [
                        lookup_valid.eq(0),
                        refill_error.eq(0),
                        rr_used.eq(0),
                    ]
                    # Advance round-robin only if the victim actually came from it.
                    with m.If(rr_used):
                        m.d.sync += [
                            rr_way.eq(Mux(refill_way == self.params.num_of_ways - 1, 0, refill_way + 1)),
                        ]

                with m.Else():
                    res_fifo.write(m, data=0, error=1)
                    m.d.sync += [
                        pending_req_valid.eq(0),
                        lookup_valid.eq(0),
                        refill_error.eq(0),
                        rr_used.eq(0),
                    ]

        # ------ Methods ---
        @def_method(m, self.accept_res)
        def _():
            m.d.sync += outstanding.eq(outstanding - 1)
            return res_fifo.read(m)

        @def_method(
            m,
            self.issue_req,
            ready=outstanding != self.params.request_depth,
        )
        def _(addr: Value, data: Value, byte_mask: Value, store: Value):
            with m.If(store):
                self.perf_stores.incr(m)
            with m.Else():
                self.perf_loads.incr(m)

            req_fifo.write(m, addr=addr, data=data, byte_mask=byte_mask, store=store)
            m.d.sync += outstanding.eq(outstanding + 1)

        # Connection to memory — single consolidated read-address mux
        # (the only driver of tag_rd_index / data_rd_addr).
        with m.If(fsm.ongoing("FLUSH")):
            m.d.comb += self.mem.tag_rd_index.eq(flush_index)
        with m.Elif(fsm.ongoing("WRITEBACK")):
            m.d.comb += [
                self.mem.data_rd_addr.index.eq(wb_index),
                self.mem.data_rd_addr.offset.eq(wb_word_counter << exact_log2(self.params.word_width_bytes)),
            ]
        with m.Else():
            m.d.comb += [
                self.mem.tag_rd_index.eq(lookup_addr.index),
                self.mem.data_rd_addr.index.eq(lookup_addr.index),
                self.mem.data_rd_addr.offset.eq(lookup_addr.offset),
            ]

        return m


class DCacheMemory(Elaboratable):
    """A helper module for managing memories used in the data cache.

    Extends the ICache memory design with:
    - A dirty bit in the tag array (for write-back policy).
    - Byte-granularity write enables on the data array (for sb/sh/sw).
    """

    def __init__(self, params: DCacheParameters) -> None:
        self.params = params

        # Dirty bit - if set, it means cache was modified since loading from ram; we need to save the cache on flush
        self.tag_data_layout = make_layout(("valid", 1), ("dirty", 1), ("tag", self.params.tag_bits))

        # One-hot per-way write enable, shared by tag and data writes.
        self.way_wr_en = Signal(self.params.num_of_ways)

        self.tag_rd_index = Signal(self.params.index_bits)
        self.tag_rd_data = Array([Signal(self.tag_data_layout) for _ in range(self.params.num_of_ways)])
        self.tag_wr_index = Signal(self.params.index_bits)
        self.tag_wr_en = Signal()
        self.tag_wr_data = Signal(self.tag_data_layout)

        self.data_addr_layout = make_layout(("index", self.params.index_bits), ("offset", self.params.offset_bits))

        self.word_bits = params.word_width
        self.word_bytes = params.word_width // 8

        self.data_rd_addr = Signal(self.data_addr_layout)
        self.data_rd_data = Array([Signal(self.word_bits) for _ in range(self.params.num_of_ways)])

        self.data_wr_addr = Signal(self.data_addr_layout)
        self.data_wr_en = Signal()
        self.data_wr_data = Signal(self.word_bits)

        self.data_wr_mask = Signal(self.word_bytes)  # byte-granularity write mask
        self.tag_mems: list[memory.Memory] = []
        self.data_mems: list[memory.Memory] = []

    def elaborate(self, platform):
        m = TModule()

        for i in range(self.params.num_of_ways):
            way_wr = self.way_wr_en[i]

            tag_mem = memory.Memory(shape=self.tag_data_layout, depth=self.params.num_of_sets, init=[])
            self.tag_mems.append(tag_mem)

            tag_mem_wp = tag_mem.write_port()
            # Transparent read: a write in the same cycle is visible to the reader.
            tag_mem_rp = tag_mem.read_port(transparent_for=[tag_mem_wp])
            m.submodules[f"tag_mem_{i}"] = tag_mem

            m.d.comb += [
                assign(self.tag_rd_data[i], tag_mem_rp.data),
                tag_mem_rp.addr.eq(self.tag_rd_index),
                tag_mem_wp.addr.eq(self.tag_wr_index),
                assign(tag_mem_wp.data, self.tag_wr_data),
                tag_mem_wp.en.eq(self.tag_wr_en & way_wr),
            ]

            data_mem = memory.Memory(
                shape=self.word_bits,
                depth=self.params.num_of_sets * self.params.words_in_line,
                init=[],
            )
            self.data_mems.append(data_mem)
            data_mem_wp = data_mem.write_port(granularity=8)
            data_mem_rp = data_mem.read_port(transparent_for=[data_mem_wp])
            m.submodules[f"data_mem_{i}"] = data_mem

            # The memories are word-addressed; drop the byte offset bits.
            word_bytes_log = exact_log2(self.word_bytes)
            rd_addr = Cat(self.data_rd_addr.offset, self.data_rd_addr.index)[word_bytes_log:]
            wr_addr = Cat(self.data_wr_addr.offset, self.data_wr_addr.index)[word_bytes_log:]

            m.d.comb += [
                self.data_rd_data[i].eq(data_mem_rp.data),
                data_mem_rp.addr.eq(rd_addr),
                data_mem_wp.addr.eq(wr_addr),
                Value.cast(data_mem_wp.data).eq(self.data_wr_data),
                data_mem_wp.en.eq(Mux(self.data_wr_en & way_wr, self.data_wr_mask, 0)),
            ]

        return m
# --- coreblocks/cache/iface.py (addition) ---

class DataCacheRefillerInterface(CacheRefillerInterface):
    """
    Data Cache Refiller Interface.

    Extends the plain cache refiller with a writeback channel so that a
    write-back cache can spill dirty lines before refilling them.

    Parameters
    ----------
    start_refill: Method
        A method that is used to start a refill for a given cache line.
    accept_refill: Method
        A method that is used to accept one fetch block from the requested cache line.
    start_writeback: Method
        Writes dirty data from cache to memory.
    accept_writeback: Method
        Accepts writeback result (a single accumulated error flag).
    """

    start_writeback: Method
    accept_writeback: Method


# --- coreblocks/cache/refiller.py (addition) ---

class SimpleCommonBusDataCacheRefiller(Elaboratable, DataCacheRefillerInterface):
    """Word-at-a-time refill/writeback engine over the common bus.

    A single FSM serializes line transfers: only one refill OR writeback is
    in flight at a time, alternating request/response states per bus word.
    """

    def __init__(self, layouts: DCacheLayouts, params: DCacheParameters, bus_master: BusMasterInterface):
        if params.word_width != bus_master.params.data_width:
            raise ValueError("Data cache word width must match bus data width.")
        if bus_master.params.granularity != 8:
            raise ValueError("Data cache refiller expects byte-granular bus selects.")

        self.layouts = layouts
        self.params = params
        self.bus_master = bus_master

        self.start_refill = Method(i=layouts.start_refill)
        self.accept_refill = Method(o=layouts.accept_refill)

        self.start_writeback = Method(i=layouts.start_writeback)
        self.accept_writeback = Method(o=layouts.accept_writeback)
        # Provided by the cache (DCache.provide_writeback_data): pulls one
        # data word per writeback beat.
        self.get_writeback_data = Method(o=layouts.provide_writeback_data)

    def elaborate(self, platform):
        m = TModule()

        m.submodules.refill_resp_fwd = refill_resp_fwd = Forwarder(self.layouts.accept_refill)
        m.submodules.writeback_resp_fwd = writeback_resp_fwd = Forwarder(self.layouts.accept_writeback)

        line_addr = Signal(self.params.addr_width - self.params.offset_bits)
        word_idx = Signal(range(self.params.words_in_line))
        writeback_error = Signal()  # sticky OR of per-beat bus errors

        word_bytes_log = exact_log2(self.params.word_width_bytes)
        full_sel = C(1).replicate(self.bus_master.params.data_width // self.bus_master.params.granularity)
        bus_word_addr = Cat(word_idx, line_addr)  # word-granular bus address
        byte_word_addr = Cat(C(0, word_bytes_log), word_idx, line_addr)  # byte address reported to cache
        last_word = word_idx == self.params.words_in_line - 1

        start_refill_req = Signal()
        start_writeback_req = Signal()
        bus_read_request_done = Signal()
        bus_read_done = Signal()
        bus_read_error = Signal()
        bus_write_request_done = Signal()
        bus_write_done = Signal()

        with m.FSM(init="IDLE") as fsm:
            with m.State("IDLE"):
                with m.If(start_refill_req):
                    m.next = "REFILL_REQ"
                with m.Elif(start_writeback_req):
                    m.next = "WRITEBACK_REQ"

            with m.State("REFILL_REQ"):
                with m.If(bus_read_request_done):
                    m.next = "REFILL_RESP"

            with m.State("REFILL_RESP"):
                with m.If(bus_read_done):
                    # A refill aborts on the first bus error.
                    with m.If(bus_read_error | last_word):
                        m.next = "IDLE"
                    with m.Else():
                        m.next = "REFILL_REQ"

            with m.State("WRITEBACK_REQ"):
                with m.If(bus_write_request_done):
                    m.next = "WRITEBACK_RESP"

            with m.State("WRITEBACK_RESP"):
                # NOTE(review): unlike refill, a writeback keeps streaming after
                # a bus error and only accumulates it — confirm this asymmetry
                # is intended.
                with m.If(bus_write_done):
                    with m.If(last_word):
                        m.next = "IDLE"
                    with m.Else():
                        m.next = "WRITEBACK_REQ"

        with Transaction(name="DCacheRefillRequest").body(m, ready=fsm.ongoing("REFILL_REQ")):
            self.bus_master.request_read(
                m,
                addr=bus_word_addr,
                sel=full_sel,
            )
            m.d.comb += bus_read_request_done.eq(1)

        with Transaction(name="DCacheRefillResponse").body(m, ready=fsm.ongoing("REFILL_RESP")):
            bus_response = self.bus_master.get_read_response(m)
            m.d.comb += [
                bus_read_done.eq(1),
                bus_read_error.eq(bus_response.err),
            ]
            refill_resp_fwd.write(
                m,
                addr=byte_word_addr,
                data=bus_response.data,
                error=bus_response.err,
                last=bus_response.err | last_word,
            )

            with m.If(~bus_response.err & ~last_word):
                m.d.sync += word_idx.eq(word_idx + 1)

        with Transaction(name="DCacheWritebackRequest").body(m, ready=fsm.ongoing("WRITEBACK_REQ")):
            data = self.get_writeback_data(m)
            self.bus_master.request_write(
                m,
                addr=bus_word_addr,
                data=data.data,
                sel=full_sel,
            )
            m.d.comb += bus_write_request_done.eq(1)

        with Transaction(name="DCacheWritebackResponse").body(m, ready=fsm.ongoing("WRITEBACK_RESP")):
            bus_response = self.bus_master.get_write_response(m)
            m.d.comb += bus_write_done.eq(1)

            with m.If(last_word):
                writeback_resp_fwd.write(m, error=writeback_error | bus_response.err)
            with m.Else():
                m.d.sync += [
                    writeback_error.eq(writeback_error | bus_response.err),
                    word_idx.eq(word_idx + 1),
                ]

        @def_method(m, self.start_refill, ready=fsm.ongoing("IDLE"))
        def _(addr) -> None:
            m.d.comb += start_refill_req.eq(1)
            m.d.sync += [
                line_addr.eq(addr[self.params.offset_bits :]),
                word_idx.eq(0),
            ]

        @def_method(m, self.accept_refill)
        def _():
            return refill_resp_fwd.read(m)

        @def_method(m, self.start_writeback, ready=fsm.ongoing("IDLE"))
        def _(addr) -> None:
            m.d.comb += start_writeback_req.eq(1)
            m.d.sync += [
                line_addr.eq(addr[self.params.offset_bits :]),
                word_idx.eq(0),
                writeback_error.eq(0),
            ]

        @def_method(m, self.accept_writeback)
        def _():
            return writeback_resp_fwd.read(m)

        return m


# --- coreblocks/func_blocks/fu/lsu/dummyLsu.py (addition) ---
# (requires: BasicFifo from transactron.lib, DCache/DCacheBypass,
#  SimpleCommonBusDataCacheRefiller, DCacheLayouts — added to that file's imports)

class LSUDataPathRouter(Elaboratable, CacheInterface):
    """Routes LSU memory requests between the cached path and the MMIO path.

    The PMA checker decides per request; the decision is queued so that each
    response is accepted from the same path its request was issued to.
    """

    def __init__(self, gen_params: GenParams, cached_data_path: CacheInterface, mmio_data_path: CacheInterface) -> None:
        self.gen_params = gen_params
        # Fetch the layouts once instead of re-querying gen_params later.
        self.layouts = gen_params.get(DCacheLayouts)

        self.cached_data_path = cached_data_path
        self.mmio_data_path = mmio_data_path

        self.issue_req = Method(i=self.layouts.issue_req)
        self.accept_res = Method(o=self.layouts.accept_res)
        self.flush = Method()

    def elaborate(self, platform):
        m = TModule()

        m.submodules.pma_checker = pma_checker = PMAChecker(self.gen_params)
        # Remembers, per in-flight request, which path it was routed to.
        m.submodules.mmio_fifo = mmio_fifo = BasicFifo([("mmio", 1)], self.gen_params.dcache_params.request_depth)

        @def_method(m, self.issue_req)
        def _(addr: Value, data: Value, byte_mask: Value, store: Value):
            m.d.av_comb += pma_checker.addr.eq(addr)

            with condition(m) as branch:
                with branch(pma_checker.result["mmio"]):
                    self.mmio_data_path.issue_req(m, addr=addr, data=data, byte_mask=byte_mask, store=store)
                with branch():
                    self.cached_data_path.issue_req(m, addr=addr, data=data, byte_mask=byte_mask, store=store)

            mmio_fifo.write(m, mmio=pma_checker.result["mmio"])

        @def_method(m, self.accept_res)
        def _():
            route = mmio_fifo.read(m)
            response = Signal(self.layouts.accept_res)

            with condition(m) as branch:
                with branch(route.mmio):
                    m.d.comb += response.eq(self.mmio_data_path.accept_res(m))
                with branch():
                    m.d.comb += response.eq(self.cached_data_path.accept_res(m))

            return response

        @def_method(m, self.flush)
        def _() -> None:
            self.cached_data_path.flush(m)
            self.mmio_data_path.flush(m)

        return m
@@ -77,7 +132,28 @@ def elaborate(self, platform): csr = self.dependency_manager.get_dependency(CSRInstancesKey()) m.submodules.pma_checker = pma_checker = PMAChecker(self.gen_params) m.submodules.pmp_checker = pmp_checker = PMPChecker(self.gen_params, csr.m_mode) - m.submodules.requester = requester = LSURequester(self.gen_params, self.bus) + + dcache_layouts = self.gen_params.get(DCacheLayouts) + if self.gen_params.dcache_params.enable: + m.submodules.dcache_refiller = dcache_refiller = SimpleCommonBusDataCacheRefiller( + dcache_layouts, self.gen_params.dcache_params, self.bus + ) + m.submodules.cached_data_path = cached_data_path = DCache( + dcache_layouts, self.gen_params.dcache_params, dcache_refiller + ) + dcache_refiller.get_writeback_data.provide(cached_data_path.provide_writeback_data) + else: + m.submodules.cached_data_path = cached_data_path = DCacheBypass( + dcache_layouts, self.gen_params.dcache_params, self.bus + ) + + m.submodules.mmio_data_path = mmio_data_path = DCacheBypass( + dcache_layouts, self.gen_params.dcache_params, self.bus + ) + m.submodules.data_path_router = data_path_router = LSUDataPathRouter( + self.gen_params, cached_data_path, mmio_data_path + ) + m.submodules.requester = requester = LSURequester(self.gen_params, data_path_router) request_layout = make_layout( ("data", self.fu_layouts.issue), diff --git a/coreblocks/func_blocks/fu/lsu/lsu_requester.py b/coreblocks/func_blocks/fu/lsu/lsu_requester.py index 86faa5085..188000d93 100644 --- a/coreblocks/func_blocks/fu/lsu/lsu_requester.py +++ b/coreblocks/func_blocks/fu/lsu/lsu_requester.py @@ -7,15 +7,14 @@ from coreblocks.params import * from coreblocks.arch import Funct3, ExceptionCause -from coreblocks.peripherals.bus_adapter import BusMasterInterface +from coreblocks.cache.iface import CacheInterface from coreblocks.interface.layouts import LSULayouts class LSURequester(Elaboratable): """ - Bus request logic for the load/store unit. 
Its job is to interface - between the LSU and the bus. - + Memory request logic for the load/store unit. Its job is to interface + between the LSU and the data cache. Attributes ---------- issue : Method @@ -24,20 +23,20 @@ class LSURequester(Elaboratable): Retrieves a result from the bus. """ - def __init__(self, gen_params: GenParams, bus: BusMasterInterface, depth: int = 4) -> None: + def __init__(self, gen_params: GenParams, cache: CacheInterface, depth: int = 4) -> None: """ Parameters ---------- gen_params : GenParams Parameters to be used during processor generation. - bus : BusMasterInterface - An instance of the bus master for interfacing with the data bus. + cache : CacheInterface + Data-cache-like interface used for memory requests. depth : int Number of requests which can be send to memory, before it provides first response. Describe the resiliency of `LSURequester` to latency of memory in case when memory is fully pipelined. """ self.gen_params = gen_params - self.bus = bus + self.cache = cache self.depth = depth lsu_layouts = gen_params.get(LSULayouts) @@ -48,7 +47,7 @@ def __init__(self, gen_params: GenParams, bus: BusMasterInterface, depth: int = self.log = HardwareLogger("backend.lsu.requester") def prepare_bytes_mask(self, m: ModuleLike, funct3: Value, addr: Value) -> Signal: - mask_len = self.gen_params.isa.xlen // self.bus.params.granularity + mask_len = self.gen_params.isa.xlen // 8 mask = Signal(mask_len) with m.Switch(funct3): with m.Case(Funct3.B, Funct3.BU): @@ -130,14 +129,11 @@ def _(addr: Value, data: Value, funct3: Value, store: Value): ) with condition(m, nonblocking=True) as branch: - with branch(aligned & store): - self.bus.request_write(m, addr=addr >> 2, data=bus_data, sel=bytes_mask) - with branch(aligned & ~store): - self.bus.request_read(m, addr=addr >> 2, sel=bytes_mask) - - with m.If(aligned): - args_fifo.write(m, addr=addr, funct3=funct3, store=store) - with m.Else(): + with branch(aligned): + self.cache.issue_req(m, 
class DCacheLayouts:
    """Layouts used in the data cache."""

    def __init__(self, gen_params: GenParams):
        fields = gen_params.get(CommonLayoutFields)

        self.store: LayoutListField = ("store", 1)
        """Request is a store operation."""

        self.byte_mask: LayoutListField = ("byte_mask", gen_params.isa.xlen // 8)
        """Byte-enable mask; each bit corresponds to one byte of the word.
        Selects the bytes written by a store and the bytes requested by an
        uncached (bypassed) load."""

        self.last: LayoutListField = ("last", 1)
        """Last word in a cache line burst transfer."""

        self.issue_req = make_layout(
            fields.addr,
            fields.data,
            self.byte_mask,
            self.store,
        )
        """Layout of a data cache request (load or store)."""

        self.accept_res = make_layout(
            fields.data,
            fields.error,
        )
        """Layout of a data cache response."""

        self.start_refill = make_layout(fields.addr)
        """Layout for starting a cache line refill."""

        self.accept_refill = make_layout(
            fields.addr,
            fields.data,
            fields.error,
            self.last,
        )
        """Layout of a single refill beat (one word of the refilled line)."""

        self.start_writeback = make_layout(fields.addr)
        """Layout for starting a dirty line writeback."""

        self.provide_writeback_data = make_layout(
            fields.data,
        )
        """Layout of a single word of writeback data."""

        self.accept_writeback = make_layout(fields.error)
        """Layout of a writeback completion response."""
class DCacheParameters:
    """Parameters of the Data Cache.

    Derives the cache geometry (set count, line size, address field split)
    from the raw configuration and validates that the configuration is
    internally consistent.

    Parameters
    ----------
    addr_width : int
        Length of addresses used in the cache (in bits).
    word_width : int
        Length of the machine word (in bits). Must be a multiple of 8.
    num_of_ways : int
        Associativity of the cache. Must be at least 1.
    num_of_sets_bits : int
        Log of the number of cache sets.
    line_bytes_log : int
        Log of the size of a single cache line in bytes. A line must hold
        at least one full machine word.
    request_depth : int
        Number of requests accepted by the public D-cache interface.
    enable : bool
        Enable the data cache. If disabled, requests are bypassed to the bus.

    Raises
    ------
    ValueError
        If the configuration is inconsistent (see parameter constraints).
    """

    def __init__(
        self,
        *,
        addr_width,
        word_width,
        num_of_ways,
        num_of_sets_bits,
        line_bytes_log,
        request_depth=4,
        enable=True,
    ):
        # Validate up front: inconsistent parameters would otherwise produce
        # silently-wrong derived values (e.g. words_in_line == 0).
        if word_width % 8 != 0:
            raise ValueError("word_width must be a multiple of 8")
        if 2**line_bytes_log < word_width // 8:
            raise ValueError("A cache line must hold at least one machine word")
        if num_of_ways < 1:
            raise ValueError("num_of_ways must be at least 1")
        if request_depth < 1:
            raise ValueError("request_depth must be at least 1")

        self.addr_width = addr_width
        self.word_width = word_width
        self.num_of_ways = num_of_ways
        self.num_of_sets_bits = num_of_sets_bits
        self.line_bytes_log = line_bytes_log
        self.request_depth = request_depth
        self.enable = enable

        # Derived geometry.
        self.num_of_sets = 2**num_of_sets_bits
        self.line_size_bytes = 2**line_bytes_log

        self.word_width_bytes = word_width // 8

        # Address split (MSB to LSB): | tag | index | offset |
        self.offset_bits = line_bytes_log
        self.index_bits = num_of_sets_bits
        self.tag_bits = self.addr_width - self.offset_bits - self.index_bits

        if self.tag_bits <= 0:
            raise ValueError("addr_width too small for the requested index and offset widths")

        self.index_start_bit = self.offset_bits
        self.index_end_bit = self.offset_bits + self.index_bits - 1

        self.words_in_line = self.line_size_bytes // self.word_width_bytes
import deque + +from amaranth import Elaboratable +from amaranth.utils import exact_log2 + +from transactron.lib import Adapter, AdapterTrans +from transactron.testing import CallTrigger, TestCaseWithSimulator, TestbenchIO, def_method_mock, TestbenchContext +from transactron.testing.method_mock import MethodMock +from transactron.utils import ModuleConnector + +from coreblocks.cache.dcache import DCache, DCacheBypass +from coreblocks.cache.refiller import SimpleCommonBusDataCacheRefiller +from coreblocks.cache.iface import DataCacheRefillerInterface +from coreblocks.interface.layouts import DCacheLayouts +from coreblocks.params import GenParams +from coreblocks.params.configurations import test_core_config + +from ..peripherals.bus_mock import BusMockParameters, MockMasterAdapter + + +class SimpleCommonBusDataCacheRefillerTestCircuit(Elaboratable): + def __init__(self, gen_params: GenParams): + self.gen_params = gen_params + self.cp = self.gen_params.dcache_params + + def elaborate(self, platform): + layouts = self.gen_params.get(DCacheLayouts) + bus_mock_params = BusMockParameters( + data_width=self.gen_params.isa.xlen, + addr_width=self.gen_params.wb_params.addr_width, + ) + + self.bus_master_adapter = MockMasterAdapter(bus_mock_params) + self.refiller = SimpleCommonBusDataCacheRefiller(layouts, self.cp, self.bus_master_adapter) + + self.writeback_data_mock = TestbenchIO(Adapter(o=layouts.provide_writeback_data)) + self.refiller.get_writeback_data.provide(self.writeback_data_mock.adapter.iface) + + self.start_refill = TestbenchIO(AdapterTrans.create(self.refiller.start_refill)) + self.accept_refill = TestbenchIO(AdapterTrans.create(self.refiller.accept_refill)) + self.start_writeback = TestbenchIO(AdapterTrans.create(self.refiller.start_writeback)) + self.accept_writeback = TestbenchIO(AdapterTrans.create(self.refiller.accept_writeback)) + + return ModuleConnector( + bus_master_adapter=self.bus_master_adapter, + refiller=self.refiller, + 
writeback_data_mock=self.writeback_data_mock, + start_refill=self.start_refill, + accept_refill=self.accept_refill, + start_writeback=self.start_writeback, + accept_writeback=self.accept_writeback, + ) + + +class TestSimpleCommonBusDataCacheRefiller(TestCaseWithSimulator): + def setup_method(self) -> None: + self.gen_params = GenParams( + test_core_config.replace( + xlen=32, + dcache_line_bytes_log=4, + ) + ) + self.cp = self.gen_params.dcache_params + self.m = SimpleCommonBusDataCacheRefillerTestCircuit(self.gen_params) + self.writeback_words = deque() + + @def_method_mock(lambda self: self.m.writeback_data_mock, enable=lambda self: bool(self.writeback_words)) + def writeback_data(self): + @MethodMock.effect + def eff(): + self.writeback_words.popleft() + + return {"data": self.writeback_words[0]} + + def bus_word_addr(self, byte_addr: int, word_idx: int) -> int: + return (byte_addr >> exact_log2(self.cp.word_width_bytes)) + word_idx + + def byte_word_addr(self, byte_addr: int, word_idx: int) -> int: + return byte_addr + word_idx * self.cp.word_width_bytes + + def full_sel(self) -> int: + return (1 << self.cp.word_width_bytes) - 1 + + def test_refill_reads_full_line_and_emits_word_beats(self): + async def process(sim: TestbenchContext): + base_addr = 0x00000100 + words = [0x10203040, 0x50607080, 0x90A0B0C0, 0xD0E0F000] + + await self.m.start_refill.call(sim, addr=base_addr) + + for word_idx, word in enumerate(words): + req = await self.m.bus_master_adapter.request_read_mock.call(sim) + assert req["addr"] == self.bus_word_addr(base_addr, word_idx) + assert req["sel"] == self.full_sel() + + await self.m.bus_master_adapter.get_read_response_mock.call(sim, data=word, err=0) + resp = await self.m.accept_refill.call(sim) + + assert resp["addr"] == self.byte_word_addr(base_addr, word_idx) + assert resp["data"] == word + assert resp["error"] == 0 + assert resp["last"] == int(word_idx == len(words) - 1) + + with self.run_simulation(self.m) as sim: + 
sim.add_testbench(process) + + def test_refill_error_returns_error_last_and_stops(self): + async def process(sim: TestbenchContext): + base_addr = 0x00000140 + words = [0x11111111, 0x22222222] + + await self.m.start_refill.call(sim, addr=base_addr) + + req = await self.m.bus_master_adapter.request_read_mock.call(sim) + assert req["addr"] == self.bus_word_addr(base_addr, 0) + await self.m.bus_master_adapter.get_read_response_mock.call(sim, data=words[0], err=0) + + resp = await self.m.accept_refill.call(sim) + assert resp["addr"] == self.byte_word_addr(base_addr, 0) + assert resp["data"] == words[0] + assert resp["error"] == 0 + assert resp["last"] == 0 + + req = await self.m.bus_master_adapter.request_read_mock.call(sim) + assert req["addr"] == self.bus_word_addr(base_addr, 1) + await self.m.bus_master_adapter.get_read_response_mock.call(sim, data=words[1], err=1) + + resp = await self.m.accept_refill.call(sim) + assert resp["addr"] == self.byte_word_addr(base_addr, 1) + assert resp["data"] == words[1] + assert resp["error"] == 1 + assert resp["last"] == 1 + + for _ in range(3): + req = await self.m.bus_master_adapter.request_read_mock.call_try(sim) + assert req is None + + with self.run_simulation(self.m) as sim: + sim.add_testbench(process) + + def test_writeback_writes_full_line_and_returns_success(self): + async def process(sim: TestbenchContext): + base_addr = 0x00000200 + words = [0xDEADBEEF, 0x11223344, 0x55667788, 0x99AABBCC] + self.writeback_words.extend(words) + + await self.m.start_writeback.call(sim, addr=base_addr) + + for word_idx, word in enumerate(words): + req = await self.m.bus_master_adapter.request_write_mock.call(sim) + assert req["addr"] == self.bus_word_addr(base_addr, word_idx) + assert req["data"] == word + assert req["sel"] == self.full_sel() + + await self.m.bus_master_adapter.get_write_response_mock.call(sim, err=0) + + resp = await self.m.accept_writeback.call(sim) + assert resp["error"] == 0 + assert not self.writeback_words + + with 
self.run_simulation(self.m) as sim: + sim.add_testbench(process) + + def test_writeback_accumulates_error(self): + async def process(sim: TestbenchContext): + base_addr = 0x00000240 + words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + errors = [0, 1, 0, 0] + self.writeback_words.extend(words) + + await self.m.start_writeback.call(sim, addr=base_addr) + + for word_idx, word in enumerate(words): + req = await self.m.bus_master_adapter.request_write_mock.call(sim) + assert req["addr"] == self.bus_word_addr(base_addr, word_idx) + assert req["data"] == word + + await self.m.bus_master_adapter.get_write_response_mock.call(sim, err=errors[word_idx]) + + resp = await self.m.accept_writeback.call(sim) + assert resp["error"] == 1 + assert not self.writeback_words + + with self.run_simulation(self.m) as sim: + sim.add_testbench(process) + + def test_start_methods_are_not_ready_while_busy(self): + async def process(sim: TestbenchContext): + refill_addr = 0x00000300 + writeback_addr = 0x00000340 + words = [0xA0A0A0A0, 0xB1B1B1B1, 0xC2C2C2C2, 0xD3D3D3D3] + + await self.m.start_refill.call(sim, addr=refill_addr) + ret = await self.m.start_writeback.call_try(sim, addr=writeback_addr) + assert ret is None + + for word_idx, word in enumerate(words): + await self.m.bus_master_adapter.request_read_mock.call(sim) + await self.m.bus_master_adapter.get_read_response_mock.call(sim, data=word, err=0) + resp = await self.m.accept_refill.call(sim) + assert resp["last"] == int(word_idx == len(words) - 1) + + self.writeback_words.extend(words) + await self.m.start_writeback.call(sim, addr=writeback_addr) + ret = await self.m.start_refill.call_try(sim, addr=refill_addr) + assert ret is None + + for _ in words: + await self.m.bus_master_adapter.request_write_mock.call(sim) + await self.m.bus_master_adapter.get_write_response_mock.call(sim, err=0) + + resp = await self.m.accept_writeback.call(sim) + assert resp["error"] == 0 + + ret = await self.m.start_refill.call_try(sim, 
addr=refill_addr) + assert ret is not None + + with self.run_simulation(self.m) as sim: + sim.add_testbench(process) + + +class DCacheBypassTestCircuit(Elaboratable): + def __init__(self, gen_params: GenParams): + self.gen_params = gen_params + self.cp = self.gen_params.dcache_params + + def elaborate(self, platform): + layouts = self.gen_params.get(DCacheLayouts) + bus_mock_params = BusMockParameters( + data_width=self.gen_params.isa.xlen, + addr_width=self.gen_params.wb_params.addr_width, + ) + + self.bus_master_adapter = MockMasterAdapter(bus_mock_params) + self.cache = DCacheBypass(layouts, self.cp, self.bus_master_adapter) + self.issue_req = TestbenchIO(AdapterTrans.create(self.cache.issue_req)) + self.accept_res = TestbenchIO(AdapterTrans.create(self.cache.accept_res)) + self.flush_cache = TestbenchIO(AdapterTrans.create(self.cache.flush)) + + return ModuleConnector( + bus_master_adapter=self.bus_master_adapter, + cache=self.cache, + issue_req=self.issue_req, + accept_res=self.accept_res, + flush_cache=self.flush_cache, + ) + + +class TestDCacheBypass(TestCaseWithSimulator): + def setup_method(self) -> None: + self.gen_params = GenParams( + test_core_config.replace( + xlen=32, + dcache_line_bytes_log=4, + ) + ) + self.cp = self.gen_params.dcache_params + self.m = DCacheBypassTestCircuit(self.gen_params) + + def test_load(self): + async def process(sim: TestbenchContext): + byte_addr = 0x00000114 + data = 0x11223344 + byte_mask = 0b0110 + + _, req = await ( + CallTrigger(sim) + .call(self.m.issue_req, addr=byte_addr, data=0, byte_mask=byte_mask, store=0) + .call(self.m.bus_master_adapter.request_read_mock) + ) + + assert req["addr"] == byte_addr >> exact_log2(self.cp.word_width_bytes) + assert req["sel"] == byte_mask + + _, resp = await ( + CallTrigger(sim) + .call(self.m.bus_master_adapter.get_read_response_mock, data=data, err=0) + .call(self.m.accept_res) + ) + + assert resp["data"] == data + assert resp["error"] == 0 + + with self.run_simulation(self.m) as 
sim: + sim.add_testbench(process) + + def test_store(self): + async def process(sim: TestbenchContext): + byte_addr = 0x00000118 + data = 0xAABBCCDD + byte_mask = 0b1100 + + _, req = await ( + CallTrigger(sim) + .call(self.m.issue_req, addr=byte_addr, data=data, byte_mask=byte_mask, store=1) + .call(self.m.bus_master_adapter.request_write_mock) + ) + + assert req["addr"] == byte_addr >> exact_log2(self.cp.word_width_bytes) + assert req["data"] == data + assert req["sel"] == byte_mask + + _, resp = await ( + CallTrigger(sim).call(self.m.bus_master_adapter.get_write_response_mock, err=0).call(self.m.accept_res) + ) + + assert resp["data"] == 0 + assert resp["error"] == 0 + + with self.run_simulation(self.m) as sim: + sim.add_testbench(process) + + def test_error(self): + async def process(sim: TestbenchContext): + byte_addr = 0x0000011C + + await ( + CallTrigger(sim) + .call(self.m.issue_req, addr=byte_addr, data=0, byte_mask=0b1111, store=0) + .call(self.m.bus_master_adapter.request_read_mock) + ) + + _, resp = await ( + CallTrigger(sim) + .call(self.m.bus_master_adapter.get_read_response_mock, data=0, err=1) + .call(self.m.accept_res) + ) + + assert resp["data"] == 0 + assert resp["error"] == 1 + + with self.run_simulation(self.m) as sim: + sim.add_testbench(process) + + def test_queue_order(self): + async def process(sim: TestbenchContext): + base_addr = 0x00000200 + words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + + for word_idx in range(self.cp.request_depth): + _, req = await ( + CallTrigger(sim) + .call( + self.m.issue_req, + addr=base_addr + word_idx * self.cp.word_width_bytes, + data=0, + byte_mask=0b1111, + store=0, + ) + .call(self.m.bus_master_adapter.request_read_mock) + ) + assert req["addr"] == (base_addr >> exact_log2(self.cp.word_width_bytes)) + word_idx + + for word in words: + _, resp = await ( + CallTrigger(sim) + .call(self.m.bus_master_adapter.get_read_response_mock, data=word, err=0) + .call(self.m.accept_res) + ) + assert 
resp["data"] == word + assert resp["error"] == 0 + + with self.run_simulation(self.m) as sim: + sim.add_testbench(process) + + +class MockedDataCacheRefiller(Elaboratable, DataCacheRefillerInterface): + def __init__(self, gen_params: GenParams): + layouts = gen_params.get(DCacheLayouts) + + self.start_refill_mock = TestbenchIO(Adapter(i=layouts.start_refill)) + self.accept_refill_mock = TestbenchIO(Adapter(o=layouts.accept_refill)) + self.start_writeback_mock = TestbenchIO(Adapter(i=layouts.start_writeback)) + self.accept_writeback_mock = TestbenchIO(Adapter(o=layouts.accept_writeback)) + + self.start_refill = self.start_refill_mock.adapter.iface + self.accept_refill = self.accept_refill_mock.adapter.iface + self.start_writeback = self.start_writeback_mock.adapter.iface + self.accept_writeback = self.accept_writeback_mock.adapter.iface + + def elaborate(self, platform): + return ModuleConnector( + start_refill=self.start_refill_mock, + accept_refill=self.accept_refill_mock, + start_writeback=self.start_writeback_mock, + accept_writeback=self.accept_writeback_mock, + ) + + +class DCacheTestCircuit(Elaboratable): + def __init__(self, gen_params: GenParams): + self.gen_params = gen_params + self.cp = self.gen_params.dcache_params + + def elaborate(self, platform): + self.refiller = MockedDataCacheRefiller(self.gen_params) + self.cache = DCache(self.gen_params.get(DCacheLayouts), self.cp, self.refiller) + self.issue_req = TestbenchIO(AdapterTrans.create(self.cache.issue_req)) + self.accept_res = TestbenchIO(AdapterTrans.create(self.cache.accept_res)) + self.flush_cache = TestbenchIO(AdapterTrans.create(self.cache.flush)) + self.provide_writeback_data = TestbenchIO(AdapterTrans.create(self.cache.provide_writeback_data)) + + return ModuleConnector( + refiller=self.refiller, + cache=self.cache, + issue_req=self.issue_req, + accept_res=self.accept_res, + flush_cache=self.flush_cache, + provide_writeback_data=self.provide_writeback_data, + ) + + +class 
TestDCache(TestCaseWithSimulator): + def setup_method(self) -> None: + self.gen_params = GenParams( + test_core_config.replace( + xlen=32, + dcache_ways=2, + dcache_sets_bits=2, + dcache_line_bytes_log=4, + ) + ) + self.cp = self.gen_params.dcache_params + self.m = DCacheTestCircuit(self.gen_params) + self.refill_start_calls = deque() + self.refill_responses = deque() + self.writeback_start_calls = deque() + self.writeback_accept_responses = deque() + self.allow_writeback_accept = False + + @def_method_mock(lambda self: self.m.refiller.start_refill_mock, enable=lambda self: True) + def start_refill_unexpected(self, addr): + @MethodMock.effect + def eff(): + self.refill_start_calls.append(addr) + if not self.refill_responses: + self.refill_responses.append({"addr": addr, "data": 0, "error": 1, "last": 1}) + + @def_method_mock(lambda self: self.m.refiller.accept_refill_mock, enable=lambda self: True) + def accept_refill_unexpected(self): + @MethodMock.effect + def eff(): + if not self.refill_responses: + raise AssertionError("unexpected accept_refill call") + self.refill_responses.popleft() + + if self.refill_responses: + return self.refill_responses[0] + return {"addr": 0, "data": 0, "error": 0, "last": 1} + + @def_method_mock(lambda self: self.m.refiller.start_writeback_mock, enable=lambda self: True) + def start_writeback_unexpected(self, addr): + @MethodMock.effect + def eff(): + self.writeback_start_calls.append(addr) + + @def_method_mock( + lambda self: self.m.refiller.accept_writeback_mock, + enable=lambda self: self.allow_writeback_accept and bool(self.writeback_accept_responses), + ) + def accept_writeback_unexpected(self): + @MethodMock.effect + def eff(): + if not self.writeback_accept_responses: + raise AssertionError("unexpected accept_writeback call") + self.writeback_accept_responses.popleft() + + return self.writeback_accept_responses[0] + + def split_addr(self, addr: int) -> tuple[int, int, int]: + index = (addr >> self.cp.offset_bits) & 
(self.cp.num_of_sets - 1) + tag = addr >> (self.cp.offset_bits + self.cp.index_bits) + word_offset = (addr & (self.cp.line_size_bytes - 1)) >> exact_log2(self.cp.word_width_bytes) + return tag, index, word_offset + + def encode_tag_entry(self, *, valid: int, dirty: int, tag: int) -> dict[str, int]: + return {"valid": valid, "dirty": dirty, "tag": tag} + + def line_word_addr(self, index: int, word_offset: int) -> int: + return (index << exact_log2(self.cp.words_in_line)) | word_offset + + def merge_word(self, initial: int, new: int, byte_mask: int) -> int: + result = initial + for byte in range(self.cp.word_width_bytes): + if byte_mask & (1 << byte): + byte_shift = byte * 8 + result &= ~(0xFF << byte_shift) + result |= ((new >> byte_shift) & 0xFF) << byte_shift + return result + + async def wait_for_flush(self, sim: TestbenchContext): + for _ in range(self.cp.num_of_sets * 3 + 4): + await sim.tick() + + async def load_line_directly( + self, sim: TestbenchContext, addr_base: int, words: list[int], *, way: int = 0, dirty: int = 0 + ): + tag, index, _ = self.split_addr(addr_base) + sim.set( + self.m.cache.mem.tag_mems[way].data[index], # type: ignore[arg-type] + self.encode_tag_entry(valid=1, dirty=dirty, tag=tag), + ) + + for word_offset, word in enumerate(words): + mem_addr = self.line_word_addr(index, word_offset) + sim.set(self.m.cache.mem.data_mems[way].data[mem_addr], word) # type: ignore[arg-type] + + await sim.tick() + + async def call_cache(self, sim: TestbenchContext, *, addr: int, data: int = 0, byte_mask: int = 0, store: int = 0): + await self.m.issue_req.call(sim, addr=addr, data=data, byte_mask=byte_mask, store=store) + return await self.m.accept_res.call(sim) + + def queue_refill_line(self, line_addr: int, words: list[int], *, error: int = 0): + for i, word in enumerate(words): + self.refill_responses.append( + { + "addr": line_addr + i * self.cp.word_width_bytes, + "data": word, + "error": error, + "last": int(i == len(words) - 1 or error), + } + ) + if 
error: + break + + async def collect_writeback_line(self, sim: TestbenchContext, *, words_in_line: int) -> list[int]: + words = [] + await sim.tick() + for _ in range(words_in_line): + resp = await self.m.provide_writeback_data.call(sim) + words.append(resp["data"]) + await sim.tick() + return words + + async def wait_until(self, sim: TestbenchContext, pred, *, max_ticks: int = 50): + for _ in range(max_ticks): + if pred(): + return + await sim.tick() + raise AssertionError("condition not met in time") + + def read_tag_entry(self, sim: TestbenchContext, *, way: int, index: int) -> dict[str, int]: + raw_tag = sim.get(self.m.cache.mem.tag_mems[way].data[index]) # type: ignore[arg-type] + return { + "valid": raw_tag["valid"], + "dirty": raw_tag["dirty"], + "tag": raw_tag["tag"], + } + + def read_data_word(self, sim: TestbenchContext, *, way: int, index: int, word_offset: int) -> int: + mem_addr = self.line_word_addr(index, word_offset) + return sim.get(self.m.cache.mem.data_mems[way].data[mem_addr]) # type: ignore[arg-type] + + def same_set_addr(self, addr: int, distance: int) -> int: + return addr + distance * self.cp.num_of_sets * self.cp.line_size_bytes + + def test_miss_returns_error_on_empty_refiller(self): + async def cache_process(sim: TestbenchContext): + await self.wait_for_flush(sim) + + resp = await self.call_cache(sim, addr=0x00000100) + + assert resp["error"] == 1 + assert resp["data"] == 0 + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_load_hit(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000120 + words = [0x11223344, 0x55667788, 0x99AABBCC, 0xDDEEFF00] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, base_addr, words, way=0, dirty=0) + + resp = await self.call_cache(sim, addr=base_addr + self.cp.word_width_bytes) + + assert resp["error"] == 0 + assert resp["data"] == words[1] + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + 
+ def test_store_hit(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000140 + initial_words = [0x11223344, 0x55667788, 0x99AABBCC, 0xDDEEFF00] + store_addr = base_addr + self.cp.word_width_bytes + store_data = 0xAABBCCDD + byte_mask = 0b0101 + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, base_addr, initial_words, way=0, dirty=0) + + resp = await self.call_cache(sim, addr=store_addr, data=store_data, byte_mask=byte_mask, store=1) + + assert resp["error"] == 0 + assert resp["data"] == 0 + + await sim.tick() + + tag, index, word_offset = self.split_addr(store_addr) + expected_word = self.merge_word(initial_words[1], store_data, byte_mask) + + stored_word = self.read_data_word(sim, way=0, index=index, word_offset=word_offset) + stored_tag = self.read_tag_entry(sim, way=0, index=index) + + assert stored_word == expected_word + assert stored_tag["valid"] == 1 + assert stored_tag["dirty"] == 1 + assert stored_tag["tag"] == tag + + load_resp = await self.call_cache(sim, addr=store_addr) + assert load_resp["error"] == 0 + assert load_resp["data"] == expected_word + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_requests_queued(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000180 + words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, base_addr, words, way=0, dirty=0) + + await self.m.issue_req.call(sim, addr=base_addr, data=0, byte_mask=0, store=0) + + ret = await self.m.issue_req.call_try( + sim, addr=base_addr + self.cp.word_width_bytes, data=0, byte_mask=0, store=0 + ) + assert ret is not None + + first_resp = await self.m.accept_res.call(sim) + assert first_resp["error"] == 0 + assert first_resp["data"] == words[0] + + second_resp = await self.m.accept_res.call(sim) + assert second_resp["error"] == 0 + assert second_resp["data"] == words[1] + + with 
self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_request_queue_full(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000180 + words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, base_addr, words, way=0, dirty=0) + + for word_offset in range(self.cp.request_depth): + await self.m.issue_req.call( + sim, + addr=base_addr + (word_offset % self.cp.words_in_line) * self.cp.word_width_bytes, + data=0, + byte_mask=0, + store=0, + ) + + ret = await self.m.issue_req.call_try(sim, addr=base_addr, data=0, byte_mask=0, store=0) + assert ret is None + + for word_offset in range(self.cp.request_depth): + resp = await self.m.accept_res.call(sim) + assert resp["error"] == 0 + assert resp["data"] == words[word_offset % self.cp.words_in_line] + + ret = await self.m.issue_req.call_try(sim, addr=base_addr, data=0, byte_mask=0, store=0) + assert ret is not None + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_flush_invalidates(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x000001C0 + words = [0xCAFEBABE, 0x0BADF00D, 0x12345678, 0x89ABCDEF] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, base_addr, words, way=0, dirty=0) + + hit_resp = await self.call_cache(sim, addr=base_addr) + assert hit_resp["error"] == 0 + assert hit_resp["data"] == words[0] + + await self.m.flush_cache.call(sim) + await self.wait_for_flush(sim) + + miss_resp = await self.call_cache(sim, addr=base_addr) + assert miss_resp["error"] == 1 + assert miss_resp["data"] == 0 + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_load_miss(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000200 + words = [0xAAAABBBB, 0xCCCCDDDD, 0x11112222, 0x33334444] + + await self.wait_for_flush(sim) + self.queue_refill_line(base_addr, words) + + 
resp = await self.call_cache(sim, addr=base_addr + self.cp.word_width_bytes) + + assert list(self.refill_start_calls) == [base_addr] + assert resp["error"] == 0 + assert resp["data"] == words[1] + assert not self.refill_responses + + hit_resp = await self.call_cache(sim, addr=base_addr + 2 * self.cp.word_width_bytes) + assert hit_resp["error"] == 0 + assert hit_resp["data"] == words[2] + assert list(self.refill_start_calls) == [base_addr] + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_store_miss(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000240 + initial_words = [0x10203040, 0x50607080, 0x90A0B0C0, 0xD0E0F000] + store_addr = base_addr + self.cp.word_width_bytes + store_data = 0x11223344 + byte_mask = 0b0011 + + await self.wait_for_flush(sim) + self.queue_refill_line(base_addr, initial_words) + + resp = await self.call_cache(sim, addr=store_addr, data=store_data, byte_mask=byte_mask, store=1) + + assert list(self.refill_start_calls) == [base_addr] + assert resp["error"] == 0 + assert resp["data"] == 0 + assert not self.refill_responses + + await sim.tick() + + tag, index, word_offset = self.split_addr(store_addr) + expected_word = self.merge_word(initial_words[1], store_data, byte_mask) + stored_word = self.read_data_word(sim, way=0, index=index, word_offset=word_offset) + stored_tag = self.read_tag_entry(sim, way=0, index=index) + + assert stored_word == expected_word + assert stored_tag["valid"] == 1 + assert stored_tag["dirty"] == 1 + assert stored_tag["tag"] == tag + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_refill_with_queued_requests(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000260 + words = [0x10203040, 0x50607080, 0x90A0B0C0, 0xD0E0F000] + + await self.wait_for_flush(sim) + self.queue_refill_line(base_addr, words) + + await self.m.issue_req.call( + sim, + addr=base_addr + self.cp.word_width_bytes, + 
data=0, + byte_mask=0, + store=0, + ) + ret = await self.m.issue_req.call_try( + sim, + addr=base_addr + 2 * self.cp.word_width_bytes, + data=0, + byte_mask=0, + store=0, + ) + assert ret is not None + + first_resp = await self.m.accept_res.call(sim) + second_resp = await self.m.accept_res.call(sim) + + assert list(self.refill_start_calls) == [base_addr] + assert first_resp["error"] == 0 + assert first_resp["data"] == words[1] + assert second_resp["error"] == 0 + assert second_resp["data"] == words[2] + assert not self.refill_responses + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_refill_error(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000280 + words = [0xABCDEF01, 0x23456789, 0x3456789A, 0x456789AB] + + await self.wait_for_flush(sim) + self.queue_refill_line(base_addr, words, error=1) + + resp = await self.call_cache(sim, addr=base_addr) + + assert list(self.refill_start_calls) == [base_addr] + assert resp["error"] == 1 + assert resp["data"] == 0 + assert not self.refill_responses + + _, index, _ = self.split_addr(base_addr) + way0_entry = self.read_tag_entry(sim, way=0, index=index) + way1_entry = self.read_tag_entry(sim, way=1, index=index) + + assert way0_entry["valid"] == 0 + assert way1_entry["valid"] == 0 + assert not self.writeback_start_calls + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_load_miss_multiple_lines(self): + async def cache_process(sim: TestbenchContext): + old_base_addr = 0x00000100 + old_words = [0xDEADBEEF, 0x11223344, 0x55667788, 0x99AABBCC] + other_base_addr = 0x00000140 + other_words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + new_base_addr = 0x00000200 + new_words = [0xAAAABBBB, 0xCCCCDDDD, 0xEEEEFFFF, 0x12345678] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, old_base_addr, old_words, way=0, dirty=1) + await self.load_line_directly(sim, other_base_addr, other_words, way=1, 
dirty=0) + self.queue_refill_line(new_base_addr, new_words) + self.writeback_accept_responses.append({"error": 0}) + + await self.m.issue_req.call( + sim, addr=new_base_addr + self.cp.word_width_bytes, data=0, byte_mask=0, store=0 + ) + + await self.wait_until(sim, lambda: len(self.writeback_start_calls) == 1) + assert list(self.writeback_start_calls) == [old_base_addr] + assert not self.refill_start_calls + + written_back_words = await self.collect_writeback_line(sim, words_in_line=self.cp.words_in_line) + assert written_back_words == old_words + assert not self.refill_start_calls + + self.allow_writeback_accept = True + resp = await self.m.accept_res.call(sim) + + assert list(self.refill_start_calls) == [new_base_addr] + assert resp["error"] == 0 + assert resp["data"] == new_words[1] + assert not self.refill_responses + + _, index, _ = self.split_addr(new_base_addr) + new_tag, _, _ = self.split_addr(new_base_addr) + stored_tag = self.read_tag_entry(sim, way=0, index=index) + hit_resp = await self.call_cache(sim, addr=new_base_addr + 2 * self.cp.word_width_bytes) + + assert stored_tag["valid"] == 1 + assert stored_tag["tag"] == new_tag + assert hit_resp["error"] == 0 + assert hit_resp["data"] == new_words[2] + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_store_miss_multiple_lines(self): + async def cache_process(sim: TestbenchContext): + old_base_addr = 0x00000140 + old_words = [0xCAFEBABE, 0x0BADF00D, 0x01020304, 0xA0B0C0D0] + other_base_addr = 0x00000100 + other_words = [0xDEADBEEF, 0x11223344, 0x55667788, 0x99AABBCC] + new_base_addr = 0x00000240 + new_words = [0x10203040, 0x50607080, 0x90A0B0C0, 0xD0E0F000] + store_addr = new_base_addr + self.cp.word_width_bytes + store_data = 0x11223344 + byte_mask = 0b0011 + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, old_base_addr, old_words, way=0, dirty=1) + await self.load_line_directly(sim, other_base_addr, other_words, way=1, dirty=0) + 
self.queue_refill_line(new_base_addr, new_words) + self.writeback_accept_responses.append({"error": 0}) + + await self.m.issue_req.call(sim, addr=store_addr, data=store_data, byte_mask=byte_mask, store=1) + + await self.wait_until(sim, lambda: len(self.writeback_start_calls) == 1) + assert list(self.writeback_start_calls) == [old_base_addr] + assert not self.refill_start_calls + + written_back_words = await self.collect_writeback_line(sim, words_in_line=self.cp.words_in_line) + assert written_back_words == old_words + assert not self.refill_start_calls + + self.allow_writeback_accept = True + resp = await self.m.accept_res.call(sim) + + assert list(self.refill_start_calls) == [new_base_addr] + assert resp["error"] == 0 + assert resp["data"] == 0 + assert not self.refill_responses + + await sim.tick() + + new_tag, index, word_offset = self.split_addr(store_addr) + expected_word = self.merge_word(new_words[1], store_data, byte_mask) + stored_word = self.read_data_word(sim, way=0, index=index, word_offset=word_offset) + stored_tag = self.read_tag_entry(sim, way=0, index=index) + + assert stored_word == expected_word + assert stored_tag["valid"] == 1 + assert stored_tag["dirty"] == 1 + assert stored_tag["tag"] == new_tag + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_miss_writeback_error(self): + async def cache_process(sim: TestbenchContext): + old_base_addr = 0x00000100 + old_words = [0xDEADBEEF, 0x11223344, 0x55667788, 0x99AABBCC] + other_base_addr = 0x00000140 + other_words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + new_base_addr = 0x00000200 + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, old_base_addr, old_words, way=0, dirty=1) + await self.load_line_directly(sim, other_base_addr, other_words, way=1, dirty=0) + self.writeback_accept_responses.append({"error": 1}) + + await self.m.issue_req.call(sim, addr=new_base_addr, data=0, byte_mask=0, store=0) + + await self.wait_until(sim, 
lambda: len(self.writeback_start_calls) == 1) + assert list(self.writeback_start_calls) == [old_base_addr] + assert not self.refill_start_calls + + written_back_words = await self.collect_writeback_line(sim, words_in_line=self.cp.words_in_line) + assert written_back_words == old_words + + self.allow_writeback_accept = True + resp = await self.m.accept_res.call(sim) + + assert resp["error"] == 1 + assert resp["data"] == 0 + assert not self.refill_start_calls + + old_tag, index, _ = self.split_addr(old_base_addr) + stored_tag = self.read_tag_entry(sim, way=0, index=index) + assert stored_tag["valid"] == 1 + assert stored_tag["dirty"] == 1 + assert stored_tag["tag"] == old_tag + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_flush_same_set(self): + async def cache_process(sim: TestbenchContext): + first_cache_line_addr = 0x00000100 + first_words = [0xDEADBEEF, 0x11223344, 0x55667788, 0x99AABBCC] + second_cache_line_addr = 0x00000200 + second_words = [0xDEADBEE9, 0x11223349, 0x55667789, 0x99AABBC9] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, first_cache_line_addr, first_words, way=0, dirty=1) + await self.load_line_directly(sim, second_cache_line_addr, second_words, way=1, dirty=0) + + self.writeback_accept_responses.append({"error": 0}) + + await self.m.flush_cache.call(sim) + await self.wait_until(sim, lambda: len(self.writeback_start_calls) == 1) + assert list(self.writeback_start_calls) == [first_cache_line_addr] + assert not self.refill_start_calls + + written_back_words = await self.collect_writeback_line(sim, words_in_line=self.cp.words_in_line) + assert written_back_words == first_words + + self.allow_writeback_accept = True + await self.wait_for_flush(sim) + + _, first_index, _ = self.split_addr(first_cache_line_addr) + _, second_index, _ = self.split_addr(second_cache_line_addr) + + first_tag = self.read_tag_entry(sim, way=0, index=first_index) + second_tag = self.read_tag_entry(sim, 
way=1, index=second_index) + + assert first_tag["valid"] == 0 + assert first_tag["dirty"] == 0 + assert second_tag["valid"] == 0 + assert second_tag["dirty"] == 0 + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_flush_writeback_error(self): + async def cache_process(sim: TestbenchContext): + base_addr = 0x00000100 + words = [0xDEADBEEF, 0x11223344, 0x55667788, 0x99AABBCC] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, base_addr, words, way=0, dirty=1) + self.writeback_accept_responses.append({"error": 1}) + + await self.m.flush_cache.call(sim) + await self.wait_until(sim, lambda: len(self.writeback_start_calls) == 1) + assert list(self.writeback_start_calls) == [base_addr] + + written_back_words = await self.collect_writeback_line(sim, words_in_line=self.cp.words_in_line) + assert written_back_words == words + + self.allow_writeback_accept = True + await self.wait_for_flush(sim) + + tag, index, _ = self.split_addr(base_addr) + stored_tag = self.read_tag_entry(sim, way=0, index=index) + + assert stored_tag["valid"] == 1 + assert stored_tag["dirty"] == 1 + assert stored_tag["tag"] == tag + assert not self.refill_start_calls + + hit_resp = await self.call_cache(sim, addr=base_addr) + assert hit_resp["error"] == 0 + assert hit_resp["data"] == words[0] + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_miss_prefers_invalid_way(self): + async def cache_process(sim: TestbenchContext): + way0_addr = 0x00000100 + refill_addr = self.same_set_addr(way0_addr, 1) + way0_words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + refill_words = [0xAABBCCDD, 0x10203040, 0x50607080, 0x90A0B0C0] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, way0_addr, way0_words, way=0, dirty=0) + self.queue_refill_line(refill_addr, refill_words) + + resp = await self.call_cache(sim, addr=refill_addr) + + assert resp["error"] == 0 + assert resp["data"] == 
refill_words[0] + assert list(self.refill_start_calls) == [refill_addr] + + way0_tag, index, _ = self.split_addr(way0_addr) + refill_tag, _, _ = self.split_addr(refill_addr) + way0_entry = self.read_tag_entry(sim, way=0, index=index) + way1_entry = self.read_tag_entry(sim, way=1, index=index) + + assert way0_entry["valid"] == 1 + assert way0_entry["tag"] == way0_tag + assert way1_entry["valid"] == 1 + assert way1_entry["tag"] == refill_tag + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_miss_uses_round_robin(self): + async def cache_process(sim: TestbenchContext): + way0_addr = 0x00000100 + way1_addr = self.same_set_addr(way0_addr, 1) + refill_addr = self.same_set_addr(way0_addr, 2) + way0_words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + way1_words = [0x41424344, 0x51525354, 0x61626364, 0x71727374] + refill_words = [0xAABBCCDD, 0x10203040, 0x50607080, 0x90A0B0C0] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, way0_addr, way0_words, way=0, dirty=0) + await self.load_line_directly(sim, way1_addr, way1_words, way=1, dirty=0) + self.queue_refill_line(refill_addr, refill_words) + + resp = await self.call_cache(sim, addr=refill_addr) + + assert resp["error"] == 0 + assert resp["data"] == refill_words[0] + assert list(self.refill_start_calls) == [refill_addr] + + refill_tag, index, _ = self.split_addr(refill_addr) + way1_tag, _, _ = self.split_addr(way1_addr) + way0_entry = self.read_tag_entry(sim, way=0, index=index) + way1_entry = self.read_tag_entry(sim, way=1, index=index) + + assert way0_entry["valid"] == 1 + assert way0_entry["tag"] == refill_tag + assert way1_entry["valid"] == 1 + assert way1_entry["tag"] == way1_tag + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) + + def test_round_robin_advances_only_if_used(self): + async def cache_process(sim: TestbenchContext): + way0_addr = 0x00000100 + way1_addr = self.same_set_addr(way0_addr, 1) + 
first_refill_addr = self.same_set_addr(way0_addr, 2) + second_refill_addr = self.same_set_addr(way0_addr, 3) + third_refill_addr = self.same_set_addr(way0_addr, 4) + way0_words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + first_refill_words = [0xA0A0A0A0, 0xA1A1A1A1, 0xA2A2A2A2, 0xA3A3A3A3] + way1_words = [0xB0B0B0B0, 0xB1B1B1B1, 0xB2B2B2B2, 0xB3B3B3B3] + second_refill_words = [0xC0C0C0C0, 0xC1C1C1C1, 0xC2C2C2C2, 0xC3C3C3C3] + third_refill_words = [0xD0D0D0D0, 0xD1D1D1D1, 0xD2D2D2D2, 0xD3D3D3D3] + + await self.wait_for_flush(sim) + await self.load_line_directly(sim, way0_addr, way0_words, way=0, dirty=0) + + self.queue_refill_line(first_refill_addr, first_refill_words) + first_resp = await self.call_cache(sim, addr=first_refill_addr) + assert first_resp["error"] == 0 + + _, index, _ = self.split_addr(way0_addr) + first_refill_tag, _, _ = self.split_addr(first_refill_addr) + assert self.read_tag_entry(sim, way=1, index=index)["tag"] == first_refill_tag + + await self.load_line_directly(sim, way1_addr, way1_words, way=1, dirty=0) + self.queue_refill_line(second_refill_addr, second_refill_words) + second_resp = await self.call_cache(sim, addr=second_refill_addr) + assert second_resp["error"] == 0 + + second_refill_tag, _, _ = self.split_addr(second_refill_addr) + assert self.read_tag_entry(sim, way=0, index=index)["tag"] == second_refill_tag + + self.queue_refill_line(third_refill_addr, third_refill_words) + third_resp = await self.call_cache(sim, addr=third_refill_addr) + assert third_resp["error"] == 0 + + third_refill_tag, _, _ = self.split_addr(third_refill_addr) + assert self.read_tag_entry(sim, way=1, index=index)["tag"] == third_refill_tag + + with self.run_simulation(self.m) as sim: + sim.add_testbench(cache_process) diff --git a/test/func_blocks/lsu/test_dummylsu.py b/test/func_blocks/lsu/test_dummylsu.py index 92514f992..99220b0e5 100644 --- a/test/func_blocks/lsu/test_dummylsu.py +++ b/test/func_blocks/lsu/test_dummylsu.py @@ -184,7 +184,7 @@ def 
generate_instr(self, max_reg_val, max_imm_val): def setup_method(self) -> None: random.seed(14) self.tests_number = 100 - self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=4)) + self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=4, dcache_enable=False)) self.test_module = DummyLSUTestCircuit(self.gen_params) self.instr_queue = deque() self.mem_data_queue = deque() @@ -296,7 +296,7 @@ def generate_instr(self, max_reg_val, max_imm_val): def setup_method(self) -> None: random.seed(14) - self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=3)) + self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=3, dcache_enable=False)) self.test_module = DummyLSUTestCircuit(self.gen_params) async def one_instr_test(self, sim: TestbenchContext): @@ -375,7 +375,7 @@ def generate_instr(self, max_reg_val, max_imm_val): def setup_method(self) -> None: random.seed(14) self.tests_number = 100 - self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=3)) + self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=3, dcache_enable=False)) self.test_module = DummyLSUTestCircuit(self.gen_params) self.instr_queue = deque() self.mem_data_queue = deque() @@ -462,7 +462,7 @@ async def process(self, sim: TestbenchContext): await self.push_one_instr(sim, self.get_instr(load_fn)) def test_fence(self): - self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=3)) + self.gen_params = GenParams(test_core_config.replace(phys_regs_bits=3, rob_entries_bits=3, dcache_enable=False)) self.test_module = DummyLSUTestCircuit(self.gen_params) @def_method_mock(lambda: self.test_module.exception_report) @@ -495,3 +495,81 @@ def eff(): with self.run_simulation(self.test_module) as sim: sim.add_testbench(self.process) + + +class TestDummyLSUDCacheIntegration(TestCaseWithSimulator): 
+ def setup_method(self) -> None: + self.gen_params = GenParams( + test_core_config.replace( + phys_regs_bits=3, + rob_entries_bits=3, + dcache_enable=True, + dcache_ways=2, + dcache_sets_bits=2, + dcache_line_bytes_log=4, + ) + ) + self.test_module = DummyLSUTestCircuit(self.gen_params) + self.cp = self.gen_params.dcache_params + + def get_load_instr(self, addr: int, rob_id: int): + return { + "rp_dst": 1, + "rob_id": rob_id, + "exec_fn": {"op_type": OpType.LOAD, "funct3": Funct3.W, "funct7": 0}, + "s1_val": addr, + "s2_val": 0, + "imm": 0, + "pc": 0, + } + + async def respond_to_refill(self, sim: TestbenchContext, base_addr: int, words: list[int]): + for word_idx, word in enumerate(words): + req = await self.test_module.bus_master_adapter.request_read_mock.call(sim) + assert req["addr"] == (base_addr >> 2) + word_idx + assert req["sel"] == 0xF + await self.test_module.bus_master_adapter.get_read_response_mock.call(sim, data=word, err=0) + + async def process(self, sim: TestbenchContext): + base_addr = 0x00000100 + words = [0x01020304, 0x11121314, 0x21222324, 0x31323334] + + await self.test_module.issue.call(sim, self.get_load_instr(base_addr, rob_id=1)) + await self.respond_to_refill(sim, base_addr, words) + + first_result = await self.test_module.push_result.call(sim) + assert first_result["rob_id"] == 1 + assert first_result["result"] == words[0] + assert first_result["exception"] == 0 + + for word_idx in range(1, self.cp.words_in_line): + addr = base_addr + word_idx * self.cp.word_width_bytes + await self.test_module.issue.call(sim, self.get_load_instr(addr, rob_id=word_idx + 1)) + + for _ in range(4): + req = await self.test_module.bus_master_adapter.request_read_mock.call_try(sim) + assert req is None + await sim.tick() + + result = await self.test_module.push_result.call(sim) + assert result["rob_id"] == word_idx + 1 + assert result["result"] == words[word_idx] + assert result["exception"] == 0 + + def test_first_load_refills_and_following_loads_hit(self): 
+ @def_method_mock(lambda: self.test_module.exception_report) + def exception_consumer(arg): + @MethodMock.effect + def eff(): + assert False + + @def_method_mock(lambda: self.test_module.precommit, validate_arguments=lambda rob_id: True) + def precommiter(rob_id): + return {"side_fx": 1} + + @def_method_mock(lambda: self.test_module.core_state) + def core_state_process(): + return {"flushing": 0} + + with self.run_simulation(self.test_module) as sim: + sim.add_testbench(self.process) diff --git a/test/func_blocks/lsu/test_pma.py b/test/func_blocks/lsu/test_pma.py index 081bf1f51..feac174ae 100644 --- a/test/func_blocks/lsu/test_pma.py +++ b/test/func_blocks/lsu/test_pma.py @@ -35,7 +35,7 @@ def test_pma_direct(self): PMARegion(0x121, 0x130, False), ] - self.gen_params = GenParams(test_core_config.replace(pma=self.pma_regions)) + self.gen_params = GenParams(test_core_config.replace(pma=self.pma_regions, dcache_enable=False)) self.test_module = PMAChecker(self.gen_params) with self.run_simulation(self.test_module) as sim: @@ -125,7 +125,7 @@ def test_pma_indirect(self): PMARegion(0x10, 0x1F, False), PMARegion(0x20, 0x2F, True), ] - self.gen_params = GenParams(test_core_config.replace(pma=self.pma_regions)) + self.gen_params = GenParams(test_core_config.replace(pma=self.pma_regions, dcache_enable=False)) self.test_module = PMAIndirectTestCircuit(self.gen_params) self.precommit_enabled = False