From 8a6bb5b8fcac3b66788f1c5d59f0c890fbf97fb9 Mon Sep 17 00:00:00 2001 From: jofrev Date: Tue, 23 Sep 2025 19:55:20 +0400 Subject: [PATCH 1/4] feat(sdmmc): support multi-block read/writes This has the potential of speeding up SD card access significantly. --- components/sdmmc/sdmmc_cmd.c | 44 +++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/components/sdmmc/sdmmc_cmd.c b/components/sdmmc/sdmmc_cmd.c index a928dee6d2ec..7ab05d39b58e 100644 --- a/components/sdmmc/sdmmc_cmd.c +++ b/components/sdmmc/sdmmc_cmd.c @@ -5,8 +5,12 @@ */ #include <inttypes.h> +#include <sys/param.h> // for MIN/MAX #include "esp_private/sdmmc_common.h" +// the maximum size in blocks of the chunks a SDMMC write/read will be split into +#define MAX_NUM_BLOCKS_PER_MULTI_BLOCK_RW (16u) static const char* TAG = "sdmmc_cmd"; @@ -462,13 +466,14 @@ esp_err_t sdmmc_write_sectors(sdmmc_card_t* card, const void* src, err = sdmmc_write_sectors_dma(card, src, start_block, block_count, block_size * block_count); } else { // SDMMC peripheral needs DMA-capable buffers. Split the write into - // separate single block writes, if needed, and allocate a temporary + // separate (multi) block writes, if needed, and allocate a temporary // DMA-capable buffer. + size_t blocks_per_write = MIN(MAX_NUM_BLOCKS_PER_MULTI_BLOCK_RW, block_count); void *tmp_buf = NULL; size_t actual_size = 0; // We don't want to force the allocation into SPIRAM, the allocator // will decide based on the buffer size and memory availability. 
- tmp_buf = heap_caps_malloc(block_size, MALLOC_CAP_DMA); + tmp_buf = heap_caps_malloc(block_size * blocks_per_write, MALLOC_CAP_DMA); if (!tmp_buf) { ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM); return ESP_ERR_NO_MEM; @@ -476,13 +481,15 @@ esp_err_t sdmmc_write_sectors(sdmmc_card_t* card, const void* src, actual_size = heap_caps_get_allocated_size(tmp_buf); const uint8_t* cur_src = (const uint8_t*) src; - for (size_t i = 0; i < block_count; ++i) { - memcpy(tmp_buf, cur_src, block_size); - cur_src += block_size; - err = sdmmc_write_sectors_dma(card, tmp_buf, start_block + i, 1, actual_size); + for (size_t i = 0; i < block_count; i += blocks_per_write) { + // make sure not to write more than the remaining blocks, i.e. block_count - i + blocks_per_write = MIN(blocks_per_write, (block_count - i)); + memcpy(tmp_buf, cur_src, block_size * blocks_per_write); + cur_src += block_size * blocks_per_write; + err = sdmmc_write_sectors_dma(card, tmp_buf, start_block + i, blocks_per_write, actual_size); if (err != ESP_OK) { - ESP_LOGD(TAG, "%s: error 0x%x writing block %d+%d", - __func__, err, start_block, i); + ESP_LOGD(TAG, "%s: error 0x%x writing blocks %zu+[%zu..%zu]", + __func__, err, start_block, i, i + blocks_per_write - 1); break; } } @@ -600,11 +607,14 @@ esp_err_t sdmmc_read_sectors(sdmmc_card_t* card, void* dst, err = sdmmc_read_sectors_dma(card, dst, start_block, block_count, block_size * block_count); } else { // SDMMC peripheral needs DMA-capable buffers. Split the read into - // separate single block reads, if needed, and allocate a temporary + // separate (multi) block reads, if needed, and allocate a temporary // DMA-capable buffer. 
+ size_t blocks_per_read = MIN(MAX_NUM_BLOCKS_PER_MULTI_BLOCK_RW, block_count); void *tmp_buf = NULL; size_t actual_size = 0; - tmp_buf = heap_caps_malloc(block_size, MALLOC_CAP_DMA); + // We don't want to force the allocation into SPIRAM, the allocator + // will decide based on the buffer size and memory availability. + tmp_buf = heap_caps_malloc(block_size * blocks_per_read, MALLOC_CAP_DMA); if (!tmp_buf) { ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM); return ESP_ERR_NO_MEM; @@ -612,15 +622,17 @@ esp_err_t sdmmc_read_sectors(sdmmc_card_t* card, void* dst, actual_size = heap_caps_get_allocated_size(tmp_buf); uint8_t* cur_dst = (uint8_t*) dst; - for (size_t i = 0; i < block_count; ++i) { - err = sdmmc_read_sectors_dma(card, tmp_buf, start_block + i, 1, actual_size); + for (size_t i = 0; i < block_count; i += blocks_per_read) { + // make sure not to read more than the remaining blocks, i.e. block_count - i + blocks_per_read = MIN(blocks_per_read, (block_count - i)); + err = sdmmc_read_sectors_dma(card, tmp_buf, start_block + i, blocks_per_read, actual_size); if (err != ESP_OK) { - ESP_LOGD(TAG, "%s: error 0x%x writing block %d+%d", - __func__, err, start_block, i); + ESP_LOGD(TAG, "%s: error 0x%x reading blocks %zu+[%zu..%zu]", + __func__, err, start_block, i, i + blocks_per_read - 1); break; } - memcpy(cur_dst, tmp_buf, block_size); - cur_dst += block_size; + memcpy(cur_dst, tmp_buf, block_size * blocks_per_read); + cur_dst += block_size * blocks_per_read; } free(tmp_buf); } From c6581d0bd934e9f833dbe5a42957c4161c6119fa Mon Sep 17 00:00:00 2001 From: jofrev Date: Sun, 28 Sep 2025 07:59:39 +0400 Subject: [PATCH 2/4] fix %z formatters --- components/sdmmc/sdmmc_cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/sdmmc/sdmmc_cmd.c b/components/sdmmc/sdmmc_cmd.c index 7ab05d39b58e..5a5c8e6f35a1 100644 --- a/components/sdmmc/sdmmc_cmd.c +++ b/components/sdmmc/sdmmc_cmd.c @@ -488,7 +488,7 @@ esp_err_t 
sdmmc_write_sectors(sdmmc_card_t* card, const void* src, cur_src += block_size * blocks_per_write; err = sdmmc_write_sectors_dma(card, tmp_buf, start_block + i, blocks_per_write, actual_size); if (err != ESP_OK) { - ESP_LOGD(TAG, "%s: error 0x%x writing blocks %zu+[%zu..%zu]", + ESP_LOGD(TAG, "%s: error 0x%x writing blocks %d+[%d..%d]", __func__, err, start_block, i, i + blocks_per_write - 1); break; } @@ -627,7 +627,7 @@ esp_err_t sdmmc_read_sectors(sdmmc_card_t* card, void* dst, blocks_per_read = MIN(blocks_per_read, (block_count - i)); err = sdmmc_read_sectors_dma(card, tmp_buf, start_block + i, blocks_per_read, actual_size); if (err != ESP_OK) { - ESP_LOGD(TAG, "%s: error 0x%x reading blocks %zu+[%zu..%zu]", + ESP_LOGD(TAG, "%s: error 0x%x reading blocks %d+[%d..%d]", __func__, err, start_block, i, i + blocks_per_read - 1); break; } From 3b3d530143e574c2a6df8829421bc043f8ce80cf Mon Sep 17 00:00:00 2001 From: jofrev Date: Sun, 28 Sep 2025 08:03:09 +0400 Subject: [PATCH 3/4] add Kconfig option --- components/sdmmc/Kconfig | 17 +++++++++++++++++ components/sdmmc/sdmmc_cmd.c | 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/components/sdmmc/Kconfig b/components/sdmmc/Kconfig index 0d2ed43feeec..0371e467b111 100644 --- a/components/sdmmc/Kconfig +++ b/components/sdmmc/Kconfig @@ -5,4 +5,21 @@ menu "SD Protocol Layer Configuration" help Enable SDIO support. Disabling this will skip SDIO-specific initialization steps + config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE + int "The maximum size of the chunks a SDMMC read/write to/from an unaligned buffer will be split into" + range 1 99999 + default 1 + help + The maximum size in blocks of the chunks a SDMMC read/write with an unaligned buffer will be split into. + The SDMMC driver requires aligned buffers for DMA access. If unaligned buffers are passed, an aligned + temporary buffer must be allocated for the actual transfer. 
+ This option defines the maximum size for the temporary buffer, which equals this option's value multiplied + with the block size (typically 512 Bytes). A value of 16 therefore leads to up to 8192 bytes being + allocated on the heap for each transfer. The allocated buffer will never be larger than the number of bytes + to transfer in total. + It also decides whether single (value == 1) or multi block read/write (value > 1) commands are used. + With the default value of 1 single-block read/write commands will be used with the allocated buffer size + matching the block size. + You should keep this option at 1 if your card or configuration doesn't support the read or write multiple + blocks commands (CMD18 & CMD25). endmenu diff --git a/components/sdmmc/sdmmc_cmd.c b/components/sdmmc/sdmmc_cmd.c index 5a5c8e6f35a1..ec23bbc2ef52 100644 --- a/components/sdmmc/sdmmc_cmd.c +++ b/components/sdmmc/sdmmc_cmd.c @@ -468,7 +468,7 @@ esp_err_t sdmmc_write_sectors(sdmmc_card_t* card, const void* src, // SDMMC peripheral needs DMA-capable buffers. Split the write into // separate (multi) block writes, if needed, and allocate a temporary // DMA-capable buffer. - size_t blocks_per_write = MIN(MAX_NUM_BLOCKS_PER_MULTI_BLOCK_RW, block_count); + size_t blocks_per_write = MIN(CONFIG_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE, block_count); void *tmp_buf = NULL; size_t actual_size = 0; // We don't want to force the allocation into SPIRAM, the allocator @@ -609,7 +609,7 @@ esp_err_t sdmmc_read_sectors(sdmmc_card_t* card, void* dst, // SDMMC peripheral needs DMA-capable buffers. Split the read into // separate (multi) block reads, if needed, and allocate a temporary // DMA-capable buffer. 
- size_t blocks_per_read = MIN(MAX_NUM_BLOCKS_PER_MULTI_BLOCK_RW, block_count); + size_t blocks_per_read = MIN(CONFIG_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE, block_count); void *tmp_buf = NULL; size_t actual_size = 0; // We don't want to force the allocation into SPIRAM, the allocator From 98cbd0a99bb12ac4babeb5360a06714419cda758 Mon Sep 17 00:00:00 2001 From: jofrev Date: Mon, 6 Oct 2025 08:26:50 +0400 Subject: [PATCH 4/4] allow users to provide DMA aligned buffer Reusing this buffer gives the additional option of only having to allocate the transaction buffer once instead of for every transaction, while still keeping the improved transaction time. To give users the maximum amount of control, the Kconfig option for the transaction buffer size applies only to the temporary buffer, which is only allocated if the DMA aligned buffer has not been pre-allocated. --- components/sdmmc/Kconfig | 4 +- components/sdmmc/include/sd_protocol_types.h | 12 +++- components/sdmmc/sdmmc_cmd.c | 75 +++++++++++++------- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/components/sdmmc/Kconfig b/components/sdmmc/Kconfig index 0371e467b111..0acfebdc4da0 100644 --- a/components/sdmmc/Kconfig +++ b/components/sdmmc/Kconfig @@ -11,8 +11,8 @@ menu "SD Protocol Layer Configuration" default 1 help The maximum size in blocks of the chunks a SDMMC read/write with an unaligned buffer will be split into. - The SDMMC driver requires aligned buffers for DMA access. If unaligned buffers are passed, an aligned - temporary buffer must be allocated for the actual transfer. + The SDMMC driver requires aligned buffers for DMA access. If unaligned buffers are passed and the host's + dma_aligned_buffer is NULL, an aligned temporary buffer must be allocated for the actual transfer. This option defines the maximum size for the temporary buffer, which equals this option's value multiplied with the block size (typically 512 Bytes). 
A value of 16 therefore leads to up to 8192 bytes being allocated on the heap for each transfer. The allocated buffer will never be larger than the number of bytes diff --git a/components/sdmmc/include/sd_protocol_types.h b/components/sdmmc/include/sd_protocol_types.h index d7c5196278db..1e0595c3a2b1 100644 --- a/components/sdmmc/include/sd_protocol_types.h +++ b/components/sdmmc/include/sd_protocol_types.h @@ -224,7 +224,17 @@ typedef struct { sdmmc_delay_phase_t input_delay_phase; /*!< input delay phase, this will only take into effect when the host works in SDMMC_FREQ_HIGHSPEED or SDMMC_FREQ_52M. Driver will print out how long the delay is*/ esp_err_t (*set_input_delay)(int slot, sdmmc_delay_phase_t delay_phase); /*!< set input delay phase */ esp_err_t (*set_input_delayline)(int slot, sdmmc_delay_line_t delay_line); /*!< set input delay line */ - void* dma_aligned_buffer; /*!< Leave it NULL. Reserved for cache aligned buffers for SDIO mode */ + /** + * @brief Cache aligned buffer for multi-block RW and IO commands + * + * Use cases: + * - Temporary buffer for multi-block read/write transactions to/from unaligned buffers. + * Allocate with DMA capable memory, size should be an integer multiple of your card's sector size. + * See also Kconfig option SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE. + * - Cache aligned buffer for IO commands in SDIO mode. + * If you allocate manually, make sure it is at least SDMMC_IO_BLOCK_SIZE bytes large. 
+ */ + void* dma_aligned_buffer; sd_pwr_ctrl_handle_t pwr_ctrl_handle; /*!< Power control handle */ bool (*check_buffer_alignment)(int slot, const void *buf, size_t size); /*!< Check if buffer meets alignment requirements */ esp_err_t (*is_slot_set_to_uhs1)(int slot, bool *is_uhs1); /*!< host slot is set to uhs1 or not*/ diff --git a/components/sdmmc/sdmmc_cmd.c b/components/sdmmc/sdmmc_cmd.c index ec23bbc2ef52..2e7f992ea86b 100644 --- a/components/sdmmc/sdmmc_cmd.c +++ b/components/sdmmc/sdmmc_cmd.c @@ -469,31 +469,44 @@ esp_err_t sdmmc_write_sectors(sdmmc_card_t* card, const void* src, // separate (multi) block writes, if needed, and allocate a temporary // DMA-capable buffer. size_t blocks_per_write = MIN(CONFIG_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE, block_count); - void *tmp_buf = NULL; - size_t actual_size = 0; - // We don't want to force the allocation into SPIRAM, the allocator - // will decide based on the buffer size and memory availability. - tmp_buf = heap_caps_malloc(block_size * blocks_per_write, MALLOC_CAP_DMA); - if (!tmp_buf) { - ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM); - return ESP_ERR_NO_MEM; + + // prefer using DMA aligned buffer if available over allocating local temporary buffer + bool use_dma_aligned_buffer = (card->host.dma_aligned_buffer != NULL); + void* buf = use_dma_aligned_buffer ? card->host.dma_aligned_buffer : NULL; + + // only allocate temporary buffer if we can't use the dma_aligned buffer + if (!use_dma_aligned_buffer) { + // We don't want to force the allocation into SPIRAM, the allocator + // will decide based on the buffer size and memory availability. 
+ buf = heap_caps_malloc(block_size * blocks_per_write, MALLOC_CAP_DMA); + if (!buf) { + ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM); + return ESP_ERR_NO_MEM; + } + } + size_t actual_size = heap_caps_get_allocated_size(buf); + blocks_per_write = actual_size / card->csd.sector_size; + if (blocks_per_write == 0) { + ESP_LOGE(TAG, "%s: buffer smaller than sector size: buf=%d, sector=%d", __func__, (int) actual_size, card->csd.sector_size); + if (!use_dma_aligned_buffer) { free(buf); } return ESP_ERR_INVALID_SIZE; + } - actual_size = heap_caps_get_allocated_size(tmp_buf); const uint8_t* cur_src = (const uint8_t*) src; for (size_t i = 0; i < block_count; i += blocks_per_write) { // make sure not to write more than the remaining blocks, i.e. block_count - i blocks_per_write = MIN(blocks_per_write, (block_count - i)); - memcpy(tmp_buf, cur_src, block_size * blocks_per_write); + memcpy(buf, cur_src, block_size * blocks_per_write); cur_src += block_size * blocks_per_write; - err = sdmmc_write_sectors_dma(card, tmp_buf, start_block + i, blocks_per_write, actual_size); + err = sdmmc_write_sectors_dma(card, buf, start_block + i, blocks_per_write, actual_size); if (err != ESP_OK) { ESP_LOGD(TAG, "%s: error 0x%x writing blocks %d+[%d..%d]", __func__, err, start_block, i, i + blocks_per_write - 1); break; } } - free(tmp_buf); + if (!use_dma_aligned_buffer) { + free(buf); + } } return err; } @@ -610,35 +623,47 @@ esp_err_t sdmmc_read_sectors(sdmmc_card_t* card, void* dst, // separate (multi) block reads, if needed, and allocate a temporary // DMA-capable buffer. size_t blocks_per_read = MIN(CONFIG_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE, block_count); - void *tmp_buf = NULL; - size_t actual_size = 0; - // We don't want to force the allocation into SPIRAM, the allocator - // will decide based on the buffer size and memory availability. 
- tmp_buf = heap_caps_malloc(block_size * blocks_per_read, MALLOC_CAP_DMA); - if (!tmp_buf) { - ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM); - return ESP_ERR_NO_MEM; + + // prefer using DMA aligned buffer if available over allocating local temporary buffer + bool use_dma_aligned_buffer = (card->host.dma_aligned_buffer != NULL); + void* buf = use_dma_aligned_buffer ? card->host.dma_aligned_buffer : NULL; + + // only allocate temporary buffer if we can't use the dma_aligned buffer + if (!use_dma_aligned_buffer) { + // We don't want to force the allocation into SPIRAM, the allocator + // will decide based on the buffer size and memory availability. + buf = heap_caps_malloc(block_size * blocks_per_read, MALLOC_CAP_DMA); + if (!buf) { + ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM); + return ESP_ERR_NO_MEM; + } + } + size_t actual_size = heap_caps_get_allocated_size(buf); + blocks_per_read = actual_size / card->csd.sector_size; + if (blocks_per_read == 0) { + ESP_LOGE(TAG, "%s: buffer smaller than sector size: buf=%d, sector=%d", __func__, (int) actual_size, card->csd.sector_size); + if (!use_dma_aligned_buffer) { free(buf); } return ESP_ERR_INVALID_SIZE; + } - actual_size = heap_caps_get_allocated_size(tmp_buf); uint8_t* cur_dst = (uint8_t*) dst; for (size_t i = 0; i < block_count; i += blocks_per_read) { // make sure not to read more than the remaining blocks, i.e. 
block_count - i blocks_per_read = MIN(blocks_per_read, (block_count - i)); - err = sdmmc_read_sectors_dma(card, tmp_buf, start_block + i, blocks_per_read, actual_size); + err = sdmmc_read_sectors_dma(card, buf, start_block + i, blocks_per_read, actual_size); if (err != ESP_OK) { ESP_LOGD(TAG, "%s: error 0x%x reading blocks %d+[%d..%d]", __func__, err, start_block, i, i + blocks_per_read - 1); break; } - memcpy(cur_dst, tmp_buf, block_size * blocks_per_read); + memcpy(cur_dst, buf, block_size * blocks_per_read); cur_dst += block_size * blocks_per_read; } - free(tmp_buf); + if (!use_dma_aligned_buffer) { + free(buf); + } } return err; } - esp_err_t sdmmc_read_sectors_dma(sdmmc_card_t* card, void* dst, size_t start_block, size_t block_count, size_t buffer_len) {