diff --git a/components/sdmmc/Kconfig b/components/sdmmc/Kconfig
index 0d2ed43feeec..0acfebdc4da0 100644
--- a/components/sdmmc/Kconfig
+++ b/components/sdmmc/Kconfig
@@ -5,4 +5,21 @@ menu "SD Protocol Layer Configuration"
         help
             Enable SDIO support. Disabling this will skip SDIO-specific initialization steps
 
+    config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE
+        int "The maximum size of the chunks an SDMMC read/write to/from an unaligned buffer will be split into"
+        range 1 99999
+        default 1
+        help
+            The maximum size in blocks of the chunks an SDMMC read/write with an unaligned buffer will be split into.
+            The SDMMC driver requires aligned buffers for DMA access. If unaligned buffers are passed and the host's
+            dma_aligned_buffer is NULL, an aligned temporary buffer must be allocated for the actual transfer.
+            This option defines the maximum size for the temporary buffer, which equals this option's value multiplied
+            by the block size (typically 512 Bytes). A value of 16 therefore leads to up to 8192 bytes being
+            allocated on the heap for each transfer. The allocated buffer will never be larger than the number of bytes
+            to transfer in total.
+            It also decides whether single (value == 1) or multi block read/write (value > 1) commands are used.
+            With the default value of 1, single-block read/write commands will be used with the allocated buffer size
+            matching the block size.
+            You should keep this option at 1 if your card or configuration doesn't support the read or write multiple
+            blocks commands (CMD18 & CMD25).
 endmenu
diff --git a/components/sdmmc/include/sd_protocol_types.h b/components/sdmmc/include/sd_protocol_types.h
index d7c5196278db..1e0595c3a2b1 100644
--- a/components/sdmmc/include/sd_protocol_types.h
+++ b/components/sdmmc/include/sd_protocol_types.h
@@ -224,7 +224,17 @@ typedef struct {
     sdmmc_delay_phase_t input_delay_phase;  /*!< input delay phase, this will only take into effect when the host works in SDMMC_FREQ_HIGHSPEED or SDMMC_FREQ_52M.
Driver will print out how long the delay is*/
     esp_err_t (*set_input_delay)(int slot, sdmmc_delay_phase_t delay_phase); /*!< set input delay phase */
     esp_err_t (*set_input_delayline)(int slot, sdmmc_delay_line_t delay_line); /*!< set input delay line */
-    void* dma_aligned_buffer;   /*!< Leave it NULL. Reserved for cache aligned buffers for SDIO mode */
+    /**
+     * @brief Cache aligned buffer for multi-block RW and IO commands
+     *
+     * Use cases:
+     * - Temporary buffer for multi-block read/write transactions to/from unaligned buffers.
+     *   Allocate with DMA capable memory, size should be an integer multiple of your card's sector size.
+     *   See also Kconfig option SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE.
+     * - Cache aligned buffer for IO commands in SDIO mode.
+     *   If you allocate manually, make sure it is at least SDMMC_IO_BLOCK_SIZE bytes large.
+     */
+    void* dma_aligned_buffer;
     sd_pwr_ctrl_handle_t pwr_ctrl_handle; /*!< Power control handle */
     bool (*check_buffer_alignment)(int slot, const void *buf, size_t size); /*!< Check if buffer meets alignment requirements */
     esp_err_t (*is_slot_set_to_uhs1)(int slot, bool *is_uhs1); /*!< host slot is set to uhs1 or not*/
diff --git a/components/sdmmc/sdmmc_cmd.c b/components/sdmmc/sdmmc_cmd.c
index a928dee6d2ec..2e7f992ea86b 100644
--- a/components/sdmmc/sdmmc_cmd.c
+++ b/components/sdmmc/sdmmc_cmd.c
@@ -5,8 +5,9 @@
  */
 
 #include <string.h>
+#include <sys/param.h> // for MIN/MAX
 #include "esp_private/sdmmc_common.h"
 
 static const char* TAG = "sdmmc_cmd";
@@ -462,31 +466,47 @@ esp_err_t sdmmc_write_sectors(sdmmc_card_t* card, const void* src,
         err = sdmmc_write_sectors_dma(card, src, start_block, block_count, block_size * block_count);
     } else {
         // SDMMC peripheral needs DMA-capable buffers.
Split the write into
-        // separate single block writes, if needed, and allocate a temporary
+        // separate (multi) block writes, if needed, and allocate a temporary
         // DMA-capable buffer.
-        void *tmp_buf = NULL;
-        size_t actual_size = 0;
-        // We don't want to force the allocation into SPIRAM, the allocator
-        // will decide based on the buffer size and memory availability.
-        tmp_buf = heap_caps_malloc(block_size, MALLOC_CAP_DMA);
-        if (!tmp_buf) {
-            ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM);
-            return ESP_ERR_NO_MEM;
+        size_t blocks_per_write = MIN(CONFIG_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE, block_count);
+
+        // prefer using DMA aligned buffer if available over allocating local temporary buffer
+        bool use_dma_aligned_buffer = (card->host.dma_aligned_buffer != NULL);
+        void* buf = use_dma_aligned_buffer ? card->host.dma_aligned_buffer : NULL;
+
+        // only allocate temporary buffer if we can't use the dma_aligned buffer
+        if (!use_dma_aligned_buffer) {
+            // We don't want to force the allocation into SPIRAM, the allocator
+            // will decide based on the buffer size and memory availability.
+            buf = heap_caps_malloc(block_size * blocks_per_write, MALLOC_CAP_DMA);
+            if (!buf) {
+                ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM);
+                return ESP_ERR_NO_MEM;
+            }
+        }
+        // NOTE: assumes dma_aligned_buffer was allocated via heap_caps as documented
+        size_t actual_size = heap_caps_get_allocated_size(buf);
+        blocks_per_write = actual_size / card->csd.sector_size;
+        if (blocks_per_write == 0) {
+            ESP_LOGE(TAG, "%s: buffer smaller than sector size: buf=%zu, sector=%d",
+                     __func__, actual_size, card->csd.sector_size);
+            if (!use_dma_aligned_buffer) {
+                free(buf);
+            }
+            return ESP_ERR_INVALID_SIZE;
         }
-        actual_size = heap_caps_get_allocated_size(tmp_buf);
         const uint8_t* cur_src = (const uint8_t*) src;
-        for (size_t i = 0; i < block_count; ++i) {
-            memcpy(tmp_buf, cur_src, block_size);
-            cur_src += block_size;
-            err = sdmmc_write_sectors_dma(card, tmp_buf, start_block + i, 1, actual_size);
+        for (size_t i = 0; i < block_count; i += blocks_per_write) {
+            // make sure not to write more than the remaining blocks, i.e. block_count - i
+            blocks_per_write = MIN(blocks_per_write, (block_count - i));
+            memcpy(buf, cur_src, block_size * blocks_per_write);
+            cur_src += block_size * blocks_per_write;
+            err = sdmmc_write_sectors_dma(card, buf, start_block + i, blocks_per_write, actual_size);
             if (err != ESP_OK) {
-                ESP_LOGD(TAG, "%s: error 0x%x writing block %d+%d",
-                        __func__, err, start_block, i);
+                ESP_LOGD(TAG, "%s: error 0x%x writing blocks %d+[%d..%d]",
+                         __func__, err, start_block, i, i + blocks_per_write - 1);
                 break;
             }
         }
-        free(tmp_buf);
+        if (!use_dma_aligned_buffer) {
+            free(buf);
+        }
     }
     return err;
 }
@@ -600,33 +620,50 @@ esp_err_t sdmmc_read_sectors(sdmmc_card_t* card, void* dst,
         err = sdmmc_read_sectors_dma(card, dst, start_block, block_count, block_size * block_count);
     } else {
         // SDMMC peripheral needs DMA-capable buffers. Split the read into
-        // separate single block reads, if needed, and allocate a temporary
+        // separate (multi) block reads, if needed, and allocate a temporary
         // DMA-capable buffer.
-        void *tmp_buf = NULL;
-        size_t actual_size = 0;
-        tmp_buf = heap_caps_malloc(block_size, MALLOC_CAP_DMA);
-        if (!tmp_buf) {
-            ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM);
-            return ESP_ERR_NO_MEM;
+        size_t blocks_per_read = MIN(CONFIG_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE, block_count);
+
+        // prefer using DMA aligned buffer if available over allocating local temporary buffer
+        bool use_dma_aligned_buffer = (card->host.dma_aligned_buffer != NULL);
+        void* buf = use_dma_aligned_buffer ? card->host.dma_aligned_buffer : NULL;
+
+        // only allocate temporary buffer if we can't use the dma_aligned buffer
+        if (!use_dma_aligned_buffer) {
+            // We don't want to force the allocation into SPIRAM, the allocator
+            // will decide based on the buffer size and memory availability.
+            buf = heap_caps_malloc(block_size * blocks_per_read, MALLOC_CAP_DMA);
+            if (!buf) {
+                ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM);
+                return ESP_ERR_NO_MEM;
+            }
+        }
+        // NOTE: assumes dma_aligned_buffer was allocated via heap_caps as documented
+        size_t actual_size = heap_caps_get_allocated_size(buf);
+        blocks_per_read = actual_size / card->csd.sector_size;
+        if (blocks_per_read == 0) {
+            ESP_LOGE(TAG, "%s: buffer smaller than sector size: buf=%zu, sector=%d",
+                     __func__, actual_size, card->csd.sector_size);
+            if (!use_dma_aligned_buffer) {
+                free(buf);
+            }
+            return ESP_ERR_INVALID_SIZE;
         }
-        actual_size = heap_caps_get_allocated_size(tmp_buf);
         uint8_t* cur_dst = (uint8_t*) dst;
-        for (size_t i = 0; i < block_count; ++i) {
-            err = sdmmc_read_sectors_dma(card, tmp_buf, start_block + i, 1, actual_size);
+        for (size_t i = 0; i < block_count; i += blocks_per_read) {
+            // make sure not to read more than the remaining blocks, i.e. block_count - i
+            blocks_per_read = MIN(blocks_per_read, (block_count - i));
+            err = sdmmc_read_sectors_dma(card, buf, start_block + i, blocks_per_read, actual_size);
             if (err != ESP_OK) {
-                ESP_LOGD(TAG, "%s: error 0x%x writing block %d+%d",
-                        __func__, err, start_block, i);
+                ESP_LOGD(TAG, "%s: error 0x%x reading blocks %d+[%d..%d]",
+                         __func__, err, start_block, i, i + blocks_per_read - 1);
                 break;
             }
-            memcpy(cur_dst, tmp_buf, block_size);
-            cur_dst += block_size;
+            memcpy(cur_dst, buf, block_size * blocks_per_read);
+            cur_dst += block_size * blocks_per_read;
+        }
+        if (!use_dma_aligned_buffer) {
+            free(buf);
         }
-        free(tmp_buf);
     }
     return err;
 }
-
 esp_err_t sdmmc_read_sectors_dma(sdmmc_card_t* card, void* dst,
     size_t start_block, size_t block_count, size_t buffer_len)
 {