Merge branch 'contrib/github_pr_17642' into 'master'

feat(sdmmc): support multi-block read/writes (GitHub PR)

Closes IDFGH-16505 and DOC-13947

See merge request espressif/esp-idf!45725
This commit is contained in:
Martin Vychodil
2026-03-18 22:03:02 +08:00
16 changed files with 327 additions and 39 deletions
@@ -48,6 +48,7 @@ extern "C" {
.input_delay_phase = SDMMC_DELAY_PHASE_0, \
.set_input_delay = &sdmmc_host_set_input_delay, \
.set_input_delayline = &sdmmc_host_set_input_delayline, \
.unaligned_multi_block_rw_max_chunk_size = 16, \
.dma_aligned_buffer = NULL, \
.pwr_ctrl_handle = NULL, \
.check_buffer_alignment = &sdmmc_host_check_buffer_alignment, \
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: 2023-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileCopyrightText: 2023-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
@@ -56,6 +56,23 @@ void sdmmc_test_rw_with_offset(sdmmc_card_t* card);
*/
void sdmmc_test_rw_highprio_task(sdmmc_card_t* card);
/**
* @brief Test multi-block read/write with unaligned buffers
*
* This function verifies that multi-block chunked transfers work correctly
* when the source/destination buffers are not DMA-aligned. It exercises:
* - Multi-block unaligned writes and reads with more blocks than the configured chunk size
* - The dma_aligned_buffer reuse path (when card->host.dma_aligned_buffer is pre-allocated)
*
* This test function works both with SDMMC and SDSPI hosts.
*
* @param card Pointer to the card object, must be initialized before calling this function.
* @param chunk_size Maximum number of blocks to transfer at once when using an
* unaligned bounce buffer. This value is written to
* card->host.unaligned_multi_block_rw_max_chunk_size before the test runs.
*/
void sdmmc_test_rw_unaligned_buffer_multiblock(sdmmc_card_t* card, size_t chunk_size);
#ifdef __cplusplus
};
#endif
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: 2022-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileCopyrightText: 2022-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
@@ -221,3 +221,77 @@ void sdmmc_test_rw_highprio_task(sdmmc_card_t* card)
vSemaphoreDelete(args.stop);
vSemaphoreDelete(args.done);
}
void sdmmc_test_rw_unaligned_buffer_multiblock(sdmmc_card_t* card, size_t chunk_size)
{
    const size_t block_size = card->csd.sector_size;
    /* Use 10 blocks so that with chunk_size=4,
     * the transfer is split into chunks: 4 + 4 + 2 blocks */
    const size_t block_count = 10;
    const size_t buffer_size = block_size * block_count;
    /* A few spare bytes so that "buffer + 1" still leaves room for buffer_size bytes */
    const size_t extra = 4;
    const size_t total_alloc = buffer_size + extra;

    /* Apply the chunk_size to the card's host config. Remember the original
     * value so the card object is left unmodified for subsequent tests
     * (mirrors how dma_aligned_buffer is saved/restored in Test D below). */
    const size_t orig_chunk_size = card->host.unaligned_multi_block_rw_max_chunk_size;
    card->host.unaligned_multi_block_rw_max_chunk_size = chunk_size;

    uint8_t *buffer = heap_caps_malloc(total_alloc, MALLOC_CAP_DMA);
    TEST_ASSERT_NOT_NULL(buffer);

    printf("Testing multi-block unaligned R/W: %d blocks, chunk_size=%d\n",
           (int)block_count, (int)chunk_size);

    /* Test A: Multi-block unaligned write, then unaligned read.
     * The +1 offset makes the buffer unaligned, forcing the bounce-buffer
     * chunking path in sdmmc_write_sectors / sdmmc_read_sectors. */
    const uint32_t seed_a = 0x12345678;
    fill_buffer(seed_a, buffer + 1, buffer_size / sizeof(uint32_t));
    TEST_ESP_OK(sdmmc_write_sectors(card, buffer + 1, 0, block_count));
    memset(buffer, 0xcc, total_alloc);
    TEST_ESP_OK(sdmmc_read_sectors(card, buffer + 1, 0, block_count));
    check_buffer(seed_a, buffer + 1, buffer_size / sizeof(uint32_t));

    /* Test B: Aligned write, then unaligned read — verifies read chunking path. */
    const uint32_t seed_b = 0xdeadbeef;
    fill_buffer(seed_b, buffer, buffer_size / sizeof(uint32_t));
    TEST_ESP_OK(sdmmc_write_sectors(card, buffer, 0, block_count));
    memset(buffer, 0xcc, total_alloc);
    TEST_ESP_OK(sdmmc_read_sectors(card, buffer + 1, 0, block_count));
    check_buffer(seed_b, buffer + 1, buffer_size / sizeof(uint32_t));

    /* Test C: Unaligned write, then aligned read — verifies write chunking path. */
    const uint32_t seed_c = 0xcafebabe;
    fill_buffer(seed_c, buffer + 1, buffer_size / sizeof(uint32_t));
    TEST_ESP_OK(sdmmc_write_sectors(card, buffer + 1, 8, block_count));
    memset(buffer, 0xcc, total_alloc);
    TEST_ESP_OK(sdmmc_read_sectors(card, buffer, 8, block_count));
    check_buffer(seed_c, buffer, buffer_size / sizeof(uint32_t));

    /* Test D: dma_aligned_buffer reuse path.
     * Pre-set card->host.dma_aligned_buffer so sdmmc_write/read_sectors
     * uses it instead of allocating a temporary buffer. */
    void *orig_dma_buf = card->host.dma_aligned_buffer;
    size_t chunk_blocks = chunk_size;
    if (chunk_blocks < block_count) {
        /* Allocate a DMA-capable buffer large enough for chunk_blocks */
        void *dma_buf = heap_caps_malloc(block_size * chunk_blocks, MALLOC_CAP_DMA);
        TEST_ASSERT_NOT_NULL(dma_buf);
        card->host.dma_aligned_buffer = dma_buf;
        printf("Testing dma_aligned_buffer reuse path (%d block buffer)\n", (int)chunk_blocks);
        const uint32_t seed_d = 0xfeedface;
        fill_buffer(seed_d, buffer + 1, buffer_size / sizeof(uint32_t));
        TEST_ESP_OK(sdmmc_write_sectors(card, buffer + 1, 0, block_count));
        memset(buffer, 0xcc, total_alloc);
        TEST_ESP_OK(sdmmc_read_sectors(card, buffer + 1, 0, block_count));
        check_buffer(seed_d, buffer + 1, buffer_size / sizeof(uint32_t));
        card->host.dma_aligned_buffer = orig_dma_buf;
        free(dma_buf);
    } else {
        printf("Skipping dma_aligned_buffer reuse test (chunk_size >= block_count)\n");
    }

    free(buffer);
    /* Leave the card's host config as we found it */
    card->host.unaligned_multi_block_rw_max_chunk_size = orig_chunk_size;
}
@@ -61,6 +61,7 @@ typedef int sdspi_dev_handle_t;
.input_delay_phase = SDMMC_DELAY_PHASE_0, \
.set_input_delay = NULL, \
.set_input_delayline = NULL, \
.unaligned_multi_block_rw_max_chunk_size = 16, \
.dma_aligned_buffer = NULL, \
.pwr_ctrl_handle = NULL, \
.check_buffer_alignment = sdspi_host_check_buffer_alignment, \
+22 -2
View File
@@ -3,7 +3,7 @@
*
* SPDX-License-Identifier: ISC
*
* SPDX-FileContributor: 2016-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2016-2026 Espressif Systems (Shanghai) CO LTD
*/
/*
* Copyright (c) 2006 Uwe Stuehler <uwe@openbsd.org>
@@ -225,7 +225,27 @@ typedef struct {
sdmmc_delay_phase_t input_delay_phase; /*!< input delay phase, this will only take into effect when the host works in SDMMC_FREQ_HIGHSPEED or SDMMC_FREQ_52M. Driver will print out how long the delay is*/
esp_err_t (*set_input_delay)(int slot, sdmmc_delay_phase_t delay_phase); /*!< set input delay phase */
esp_err_t (*set_input_delayline)(int slot, sdmmc_delay_line_t delay_line); /*!< set input delay line */
void* dma_aligned_buffer; /*!< Leave it NULL. Reserved for cache aligned buffers for SDIO mode */
/**
* @brief Maximum number of blocks to read/write at once when using an unaligned buffer.
*
* When a multi-block read/write is requested with an unaligned buffer, the driver
* splits the transfer into chunks of this many blocks. Set to 0 to use the default
* value of 1 (single-block transfers). Higher values improve throughput but require
* a larger DMA-capable temporary buffer.
*/
size_t unaligned_multi_block_rw_max_chunk_size;
/**
* @brief Cache aligned buffer for multi-block RW and IO commands
*
* Use cases:
* - Temporary buffer for multi-block read/write transactions to/from unaligned buffers.
* Allocate with DMA capable memory, size should be an integer multiple of your card's sector size.
* The number of blocks transferred per chunk is controlled by
* `unaligned_multi_block_rw_max_chunk_size`.
* - Cache aligned buffer for IO commands in SDIO mode.
* If you allocate manually, make sure it is at least SDMMC_IO_BLOCK_SIZE bytes large.
*/
void* dma_aligned_buffer;
sd_pwr_ctrl_handle_t pwr_ctrl_handle; /*!< Power control handle */
bool (*check_buffer_alignment)(int slot, const void *buf, size_t size); /*!< Check if buffer meets alignment requirements */
esp_err_t (*is_slot_set_to_uhs1)(int slot, bool *is_uhs1); /*!< host slot is set to uhs1 or not*/
+113 -33
View File
@@ -6,10 +6,43 @@
#include <inttypes.h>
#include "freertos/FreeRTOS.h"
#include <sys/param.h> // for MIN/MAX
#include "esp_private/sdmmc_common.h"
static const char* TAG = "sdmmc_cmd";
/**
* @brief Get the effective chunk size for unaligned multi-block transfers.
*
* Returns the configured value, or 1 (single-block) when the field is left
* at its zero-initialised default.
*/
/**
 * @brief Effective chunk size (in blocks) for unaligned multi-block transfers.
 *
 * Reads card->host.unaligned_multi_block_rw_max_chunk_size; a value of 0
 * (the zero-initialised default) maps to 1, i.e. single-block transfers.
 */
static inline size_t get_chunk_size(const sdmmc_card_t *card)
{
    const size_t configured = card->host.unaligned_multi_block_rw_max_chunk_size;
    if (configured == 0) {
        return 1;
    }
    return configured;
}
/**
 * @brief Allocate a DMA-capable bounce buffer, shrinking the request on OOM.
 *
 * Tries to allocate *actual_size bytes with MALLOC_CAP_DMA. On failure, the
 * requested size is halved (rounded down to a whole number of blocks) and the
 * allocation is retried, until the request would drop below one block.
 *
 * @param[in,out] actual_size  In: desired size in bytes. Out: size actually
 *                             allocated (always >= block_size on success).
 * @param block_size           Card block (sector) size in bytes; lower bound
 *                             for the allocation.
 * @param[out] buf             Receives the allocated buffer; caller owns and
 *                             must free() it.
 * @return ESP_OK on success, ESP_ERR_INVALID_ARG on NULL pointers,
 *         ESP_ERR_NO_MEM if not even one block could be allocated.
 */
static esp_err_t allocate_dma_buf(size_t* actual_size, size_t block_size, void **buf)
{
    if (actual_size == NULL || buf == NULL) {
        return ESP_ERR_INVALID_ARG;
    }
    const size_t requested_size = *actual_size;
    do {
        if (*actual_size < block_size) {
            ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM);
            return ESP_ERR_NO_MEM;
        }
        *buf = heap_caps_malloc(*actual_size, MALLOC_CAP_DMA);
        if (!*buf) {
            /* Halve the request and round down to a whole number of blocks,
             * so the resulting buffer length is always a block multiple. */
            *actual_size /= 2;
            *actual_size -= *actual_size % block_size;
            /* %zu for size_t: %d would be a format/argument mismatch */
            ESP_LOGD(TAG, "%s: required space for buffer of size %zu not available, trying again with size %zu", __func__, requested_size, *actual_size);
        }
    } while (!*buf);
    return ESP_OK;
}
esp_err_t sdmmc_send_cmd(sdmmc_card_t* card, sdmmc_command_t* cmd)
{
@@ -463,31 +496,54 @@ esp_err_t sdmmc_write_sectors(sdmmc_card_t* card, const void* src,
err = sdmmc_write_sectors_dma(card, src, start_block, block_count, block_size * block_count);
} else {
// SDMMC peripheral needs DMA-capable buffers. Split the write into
// separate single block writes, if needed, and allocate a temporary
// separate (multi) block writes, if needed, and allocate a temporary
// DMA-capable buffer.
void *tmp_buf = NULL;
size_t actual_size = 0;
// We don't want to force the allocation into SPIRAM, the allocator
// will decide based on the buffer size and memory availability.
tmp_buf = heap_caps_malloc(block_size, MALLOC_CAP_DMA);
if (!tmp_buf) {
ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM);
return ESP_ERR_NO_MEM;
size_t chunk_size = get_chunk_size(card);
size_t blocks_per_write = MIN(chunk_size, block_count);
// prefer using DMA aligned buffer if available over allocating local temporary buffer
bool use_dma_aligned_buffer = (card->host.dma_aligned_buffer != NULL);
void* buf = card->host.dma_aligned_buffer;
size_t actual_size = block_size * blocks_per_write;
if (!use_dma_aligned_buffer) {
// Allocate a temporary DMA-capable buffer.
// We don't want to force the allocation into SPIRAM, the allocator
// will decide based on the buffer size and memory availability.
// We start with the largest buffer possible to minimize the number of read iterations, but if that fails, we try smaller sizes down to a single block.
err = allocate_dma_buf(&actual_size, block_size, &buf);
if (err != ESP_OK) {
return err;
}
blocks_per_write = actual_size / block_size;
} else {
// Check that the provided dma_aligned_buffer is large enough
actual_size = heap_caps_get_allocated_size(buf);
blocks_per_write = actual_size / card->csd.sector_size;
size_t chunk_size = get_chunk_size(card);
blocks_per_write = MIN(chunk_size, blocks_per_write);
if (blocks_per_write == 0) {
ESP_LOGE(TAG, "%s: buffer smaller than sector size: buf=%d, sector=%d", __func__, actual_size, card->csd.sector_size);
return ESP_ERR_INVALID_SIZE;
}
}
actual_size = heap_caps_get_allocated_size(tmp_buf);
const uint8_t* cur_src = (const uint8_t*) src;
for (size_t i = 0; i < block_count; ++i) {
memcpy(tmp_buf, cur_src, block_size);
cur_src += block_size;
err = sdmmc_write_sectors_dma(card, tmp_buf, start_block + i, 1, actual_size);
for (size_t i = 0; i < block_count; i += blocks_per_write) {
// make sure not to write more than the remaining blocks, i.e. block_count - i
blocks_per_write = MIN(blocks_per_write, (block_count - i));
memcpy(buf, cur_src, block_size * blocks_per_write);
cur_src += block_size * blocks_per_write;
err = sdmmc_write_sectors_dma(card, buf, start_block + i, blocks_per_write, actual_size);
if (err != ESP_OK) {
ESP_LOGD(TAG, "%s: error 0x%x writing block %d+%d",
__func__, err, start_block, i);
ESP_LOGD(TAG, "%s: error 0x%x writing blocks %d+[%d..%d]",
__func__, err, start_block, i, i + blocks_per_write - 1);
break;
}
}
free(tmp_buf);
if (!use_dma_aligned_buffer) {
free(buf);
}
}
return err;
}
@@ -603,33 +659,57 @@ esp_err_t sdmmc_read_sectors(sdmmc_card_t* card, void* dst,
err = sdmmc_read_sectors_dma(card, dst, start_block, block_count, block_size * block_count);
} else {
// SDMMC peripheral needs DMA-capable buffers. Split the read into
// separate single block reads, if needed, and allocate a temporary
// separate (multi) block reads, if needed, and allocate a temporary
// DMA-capable buffer.
void *tmp_buf = NULL;
size_t actual_size = 0;
tmp_buf = heap_caps_malloc(block_size, MALLOC_CAP_DMA);
if (!tmp_buf) {
ESP_LOGE(TAG, "%s: not enough mem, err=0x%x", __func__, ESP_ERR_NO_MEM);
return ESP_ERR_NO_MEM;
size_t chunk_size = get_chunk_size(card);
size_t blocks_per_read = MIN(chunk_size, block_count);
// prefer using DMA aligned buffer if available over allocating local temporary buffer
bool use_dma_aligned_buffer = (card->host.dma_aligned_buffer != NULL);
void* buf = card->host.dma_aligned_buffer;
size_t actual_size = block_size * blocks_per_read;
if (!use_dma_aligned_buffer) {
// Allocate a temporary DMA-capable buffer.
// We don't want to force the allocation into SPIRAM, the allocator
// will decide based on the buffer size and memory availability.
// We start with the largest buffer possible to minimize the number of read iterations, but if that fails, we try smaller sizes down to a single block.
err = allocate_dma_buf(&actual_size, block_size, &buf);
if (err != ESP_OK) {
return err;
}
blocks_per_read = actual_size / block_size;
} else {
// Check that the provided dma_aligned_buffer is large enough
actual_size = heap_caps_get_allocated_size(buf);
blocks_per_read = actual_size / card->csd.sector_size;
size_t chunk_size = get_chunk_size(card);
blocks_per_read = MIN(chunk_size, blocks_per_read);
if (blocks_per_read == 0) {
ESP_LOGE(TAG, "%s: buffer smaller than sector size: buf=%d, sector=%d", __func__, actual_size, card->csd.sector_size);
return ESP_ERR_INVALID_SIZE;
}
}
actual_size = heap_caps_get_allocated_size(tmp_buf);
uint8_t* cur_dst = (uint8_t*) dst;
for (size_t i = 0; i < block_count; ++i) {
err = sdmmc_read_sectors_dma(card, tmp_buf, start_block + i, 1, actual_size);
for (size_t i = 0; i < block_count; i += blocks_per_read) {
// make sure not to read more than the remaining blocks, i.e. block_count - i
blocks_per_read = MIN(blocks_per_read, (block_count - i));
err = sdmmc_read_sectors_dma(card, buf, start_block + i, blocks_per_read, actual_size);
if (err != ESP_OK) {
ESP_LOGD(TAG, "%s: error 0x%x writing block %d+%d",
__func__, err, start_block, i);
ESP_LOGE(TAG, "%s: error 0x%x reading blocks %d+[%d..%d]",
__func__, err, start_block, i, i + blocks_per_read - 1);
break;
}
memcpy(cur_dst, tmp_buf, block_size);
cur_dst += block_size;
memcpy(cur_dst, buf, block_size * blocks_per_read);
cur_dst += block_size * blocks_per_read;
}
if (!use_dma_aligned_buffer) {
free(buf);
}
free(tmp_buf);
}
return err;
}
esp_err_t sdmmc_read_sectors_dma(sdmmc_card_t* card, void* dst,
size_t start_block, size_t block_count, size_t buffer_len)
{
@@ -1,5 +1,5 @@
idf_component_register(SRCS "test_sdmmc_app.c"
PRIV_INCLUDE_DIRS "."
PRIV_REQUIRES esp_blockdev unity sdmmc esp_driver_sdmmc sdmmc_tests
PRIV_REQUIRES esp_blockdev unity sdmmc esp_driver_sdmmc sdmmc_tests common_test_flows
WHOLE_ARCHIVE
)
@@ -12,6 +12,7 @@
#include "sd_protocol_defs.h"
#include "sdmmc_cmd.h"
#include "sdmmc_test_begin_end_sd.h"
#include "sdmmc_test_rw_common.h"
#include "esp_blockdev.h"
TEST_GROUP(sdmmc);
@@ -69,9 +70,22 @@ TEST(sdmmc, test_bdl_interface)
sdmmc_test_sd_end(&card);
}
TEST(sdmmc, test_multiblock_unaligned_rw)
{
sdmmc_card_t card;
int slot = 1;
int width = 1;
int freq_khz = SDMMC_FREQ_DEFAULT;
sdmmc_test_sd_skip_if_board_incompatible(slot, width, freq_khz, 0, 0);
sdmmc_test_sd_begin(slot, width, freq_khz, 0, &card);
sdmmc_test_rw_unaligned_buffer_multiblock(&card, 4);
sdmmc_test_sd_end(&card);
}
TEST_GROUP_RUNNER(sdmmc)
{
RUN_TEST_CASE(sdmmc, test_bdl_interface)
RUN_TEST_CASE(sdmmc, test_multiblock_unaligned_rw)
}
void app_main(void)
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
# SPDX-FileCopyrightText: 2025-2026 Espressif Systems (Shanghai) CO LTD
# SPDX-License-Identifier: Apache-2.0
import pytest
from pytest_embedded import Dut
@@ -6,6 +6,7 @@ from pytest_embedded_idf.utils import idf_parametrize
@pytest.mark.sdcard
@idf_parametrize('config', ['default'], indirect=['config'])
@idf_parametrize('target', ['esp32'], indirect=['target'])
def test_sdmmc_extra(dut: Dut) -> None:
dut.expect_unity_test_output()
+13
View File
@@ -56,6 +56,19 @@ Using API with SD Memory Cards
- If the card is not used anymore, call the host driver function to disable the host peripheral and free the resources allocated by the driver (``sdmmc_host_deinit`` for SDMMC or ``sdspi_host_deinit`` for SDSPI).
Unaligned Buffer Performance
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When buffers passed to :cpp:func:`sdmmc_read_sectors` or :cpp:func:`sdmmc_write_sectors` are not DMA-capable (e.g., allocated in PSRAM), the driver copies data through a temporary DMA-capable buffer. By default, this is done one block at a time using single-block transfer commands.
To improve throughput in this scenario, set the :cpp:member:`sdmmc_host_t::unaligned_multi_block_rw_max_chunk_size` field to a value greater than 1. This enables multi-block transfer commands (CMD18/CMD25), which can significantly reduce transfer overhead. The trade-off is higher heap usage (buffer size = N × block size, where N is the configured value and the block size is typically 512 bytes). When this field is 0 (default), the driver falls back to single-block transfers (equivalent to 1). Values greater than 1 are recommended to be a power of two (e.g., 2, 4, 8, 16 or 32) for best performance.
.. note::
Keep this value at 0 or 1 if your card or configuration does not support multi-block read/write commands (CMD18 and CMD25).
Alternatively, a pre-allocated DMA-capable buffer can be provided via the :cpp:member:`sdmmc_host_t::dma_aligned_buffer` field. This avoids per-transfer heap allocations and allows the driver to reuse the same buffer across transfers. The buffer must be at least one sector in size (typically 512 bytes) and should ideally be a multiple of the sector size.
.. only:: not SOC_SDMMC_HOST_SUPPORTED
eMMC Support
@@ -56,6 +56,19 @@ SD/SDIO/MMC 驱动支持 SD 存储器、SDIO 卡和 eMMC 芯片。这是一个
- 如果不再使用该卡,请调用主机驱动函数,例如 ``sdmmc_host_deinit`` 或 ``sdspi_host_deinit``,以禁用 SDMMC 主机外设或 SDSPI 主机外设,并释放驱动程序分配的资源。
未对齐 buffer 性能
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
当传递给 :cpp:func:`sdmmc_read_sectors` 或 :cpp:func:`sdmmc_write_sectors` 的 buffer 不支持 DMA(例如,分配在 PSRAM 中)时,驱动会通过一个临时的支持 DMA 的 buffer 来复制数据。默认情况下,该操作会使用单块传输命令一次传输一个块。
若要在此场景下提升吞吐量,可将 :cpp:member:`sdmmc_host_t::unaligned_multi_block_rw_max_chunk_size` 字段设置为大于 1 的值。驱动将启用多块传输命令 (CMD18/CMD25),从而显著降低传输开销,但堆内存占用会相应增加(buffer 大小 = N × 块大小,其中 N 为配置值,块大小通常为 512 字节)。当该字段为 0(默认值)时,驱动回退至单块传输模式(等效于 1)。建议将大于 1 的配置值设置为 2 的幂(如 2、4、8、16 或 32)以获得最佳性能。
.. note::
如果你的 SD 卡或配置不支持多块读写命令(CMD18 和 CMD25),请将该选项值保持为 0 或 1。
或者,可以通过 :cpp:member:`sdmmc_host_t::dma_aligned_buffer` 字段提供一个预先分配的、支持 DMA 的 buffer。这样可以避免每次传输时都分配堆内存,驱动也能在多次传输间复用该 buffer。该 buffer 的大小必须至少为一个扇区(通常为 512 字节),理想情况下应为扇区大小的整数倍。
.. only:: not SOC_SDMMC_HOST_SUPPORTED
eMMC 芯片支持
@@ -293,6 +293,50 @@ menu "Performance Benchmark Example Configuration"
help
Please read the schematic first and input your LDO ID.
choice
prompt "The maximum size of the chunks a SDMMC read/write to/from an unaligned buffer will be split into"
default SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_32
help
The maximum size in blocks of the chunks a SDMMC read/write with an unaligned buffer will be split
into.
The SDMMC driver requires aligned buffers for DMA access. If unaligned buffers are passed and the
host's dma_aligned_buffer is NULL, an aligned temporary buffer must be allocated for the actual
transfer.
This option defines the maximum size for the temporary buffer, which equals this option's value
multiplied with the block size (typically 512 bytes). A value of 16 therefore leads to up to 8192
bytes being allocated on the heap for each transfer. The allocated buffer will never be larger than
the number of bytes to transfer in total.
It also decides whether single (value == 1) or multi block read/write (value > 1) commands are used.
With a value of 1, single-block read/write commands will be used with the allocated buffer
size matching the block size.
You should keep this option at 1 if your card or configuration doesn't support the read or write
multiple blocks commands (CMD18 & CMD25).
config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_1
bool "1"
config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_2
bool "2"
config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_4
bool "4"
config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_8
bool "8"
config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_16
bool "16"
config SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_32
bool "32"
endchoice
config EXAMPLE_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE
int
default 1 if SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_1
default 2 if SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_2
default 4 if SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_4
default 8 if SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_8
default 16 if SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_16
default 32 if SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE_32
default 1
endmenu # "SD card test config"
endmenu # "Performance Monitor Example Configuration"
@@ -60,6 +60,10 @@ void init_sd_config(sdmmc_host_t *out_host, sdspi_device_config_t *out_slot_conf
ESP_LOGI(TAG, "Using SDMMC peripheral");
sdmmc_host_t host = SDMMC_HOST_DEFAULT();
host.max_freq_khz = freq_khz;
// Set the chunk size for unaligned multi-block read/write operations to N blocks.
// This is to improve performance when the buffer is not aligned or the size is not a multiple of the SD card's block size.
// I.e. performance uplift for "write/read more/less than..." test cases - e.g. an equivalent situation to appending to a file.
host.unaligned_multi_block_rw_max_chunk_size = CONFIG_EXAMPLE_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE;
// This initializes the slot without card detect (CD) and write protect (WP) signals.
// Modify slot_config.gpio_cd and slot_config.gpio_wp if your board has these signals.
@@ -101,6 +105,10 @@ void init_sd_config(sdmmc_host_t *out_host, sdspi_device_config_t *out_slot_conf
ESP_LOGI(TAG, "Using SPI peripheral");
sdmmc_host_t host = SDSPI_HOST_DEFAULT();
host.max_freq_khz = freq_khz;
// Set the chunk size for unaligned multi-block read/write operations to N blocks.
// This is to improve performance when the buffer is not aligned or the size is not a multiple of the SD card's block size.
// I.e. performance uplift for "write/read more/less than..." test cases - e.g. an equivalent situation to appending to a file.
host.unaligned_multi_block_rw_max_chunk_size = CONFIG_EXAMPLE_SD_UNALIGNED_MULTI_BLOCK_RW_MAX_CHUNK_SIZE;
spi_bus_config_t bus_cfg = {
.mosi_io_num = CONFIG_EXAMPLE_PIN_MOSI,
@@ -162,6 +162,7 @@ void app_main(void)
// For setting a specific frequency, use host.max_freq_khz (range 400kHz - 40MHz for SDMMC)
// Example: for fixed frequency of 10MHz, use host.max_freq_khz = 10000;
sdmmc_host_t host = SDMMC_HOST_DEFAULT();
host.unaligned_multi_block_rw_max_chunk_size = 8;
#if CONFIG_EXAMPLE_SDMMC_SPEED_HS
host.max_freq_khz = SDMMC_FREQ_HIGHSPEED;
#elif CONFIG_EXAMPLE_SDMMC_SPEED_UHS_I_SDR50
@@ -122,6 +122,7 @@ void app_main(void)
// For setting a specific frequency, use host.max_freq_khz (range 400kHz - 20MHz for SDSPI)
// Example: for fixed frequency of 10MHz, use host.max_freq_khz = 10000;
sdmmc_host_t host = SDSPI_HOST_DEFAULT();
host.unaligned_multi_block_rw_max_chunk_size = 8;
// For SoCs where the SD power can be supplied both via an internal or external (e.g. on-board LDO) power supply.
// When using specific IO pins (which can be used for ultra high-speed SDMMC) to connect to the SD card