Skip to content

Add DWC2 cache maintenance routines for STM32 #2963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jul 7, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions hw/bsp/stm32f7/family.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ void OTG_HS_IRQHandler(void) {
//--------------------------------------------------------------------+

void board_init(void) {
SCB_EnableICache();
SCB_EnableDCache();

HAL_Init();

board_clock_init();

// Enable All GPIOs clocks
Expand Down
5 changes: 5 additions & 0 deletions hw/bsp/stm32h7/family.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ static void trace_etm_init(void) {
#endif

void board_init(void) {
SCB_EnableICache();
SCB_EnableDCache();

HAL_Init();

// Implemented in board.h
SystemClock_Config();

Expand Down
2 changes: 0 additions & 2 deletions hw/bsp/stm32h7rs/boards/stm32h7s3nucleo/board.cmake
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
set(MCU_VARIANT stm32h7s3xx)
set(JLINK_DEVICE stm32h7s3xx)

set(LD_FILE_GNU ${CMAKE_CURRENT_LIST_DIR}/stm32h7s3xx_flash.ld)
set(LD_FILE_Clang ${LD_FILE_GNU})
set(LD_FILE_IAR ${CMAKE_CURRENT_LIST_DIR}/stm32h7s3xx_flash.icf)

function(update_board TARGET)
target_compile_definitions(${TARGET} PUBLIC
Expand Down
4 changes: 0 additions & 4 deletions hw/bsp/stm32h7rs/boards/stm32h7s3nucleo/board.mk
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@ JLINK_DEVICE = stm32h7s3xx
# flash target using on-board stlink
flash: flash-stlink

# Linker
LD_FILE_GCC = $(BOARD_PATH)/stm32h7s3xx_flash.ld
LD_FILE_IAR = $(BOARD_PATH)/stm32h7s3xx_flash.icf

SRC_C += \
$(ST_TCPP0203)/tcpp0203.c \
$(ST_TCPP0203)/tcpp0203_reg.c \
Expand Down
4 changes: 4 additions & 0 deletions hw/bsp/stm32h7rs/family.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,13 @@ void log_swo_init(void)
#endif

void board_init(void) {
SCB_EnableICache();
SCB_EnableDCache();

HAL_Init();

HAL_PWREx_ConfigSupply(PWR_LDO_SUPPLY);

// Implemented in board.h
SystemClock_Config();

Expand Down
7 changes: 3 additions & 4 deletions hw/bsp/stm32h7rs/family.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ function(add_board_target BOARD_TARGET)
set(STARTUP_FILE_IAR ${ST_CMSIS}/Source/Templates/iar/startup_${MCU_VARIANT}.s)

if(NOT DEFINED LD_FILE_GNU)
set(LD_FILE_GNU ${ST_CMSIS}/Source/Templates/gcc/linker/${MCU_VARIANT}_flash.ld)
set(LD_FILE_GNU ${CMAKE_CURRENT_FUNCTION_LIST_DIR}/linker/${MCU_VARIANT}_flash.ld)
endif()
set(LD_FILE_Clang ${LD_FILE_GNU})
if(NOT DEFINED LD_FILE_IAR)
set(LD_FILE_IAR ${ST_CMSIS}/Source/Templates/iar/linker/${MCU_VARIANT}_flash.icf)
set(LD_FILE_IAR ${CMAKE_CURRENT_FUNCTION_LIST_DIR}/linker/${MCU_VARIANT}_flash.icf)
endif()

add_library(${BOARD_TARGET} STATIC
Expand Down Expand Up @@ -87,8 +87,7 @@ function(add_board_target BOARD_TARGET)
BOARD_TUD_MAX_SPEED=${RHPORT_DEVICE_SPEED}
BOARD_TUH_RHPORT=${RHPORT_HOST}
BOARD_TUH_MAX_SPEED=${RHPORT_HOST_SPEED}
SEGGER_RTT_SECTION="noncacheable_buffer"
BUFFER_SIZE_UP=0x3000
SEGGER_RTT_SECTION=\"dtcm_data\"
)

update_board(${BOARD_TARGET})
Expand Down
7 changes: 3 additions & 4 deletions hw/bsp/stm32h7rs/family.mk
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ CFLAGS += \
-DBOARD_TUD_MAX_SPEED=${RHPORT_DEVICE_SPEED} \
-DBOARD_TUH_RHPORT=${RHPORT_HOST} \
-DBOARD_TUH_MAX_SPEED=${RHPORT_HOST_SPEED} \
-DSEGGER_RTT_SECTION=\"noncacheable_buffer\" \
-DBUFFER_SIZE_UP=0x3000 \
-DSEGGER_RTT_SECTION="dtcm_data" \

# GCC Flags
CFLAGS_GCC += \
Expand Down Expand Up @@ -91,5 +90,5 @@ SRC_S_GCC += $(ST_CMSIS)/Source/Templates/gcc/startup_$(MCU_VARIANT).s
SRC_S_IAR += $(ST_CMSIS)/Source/Templates/iar/startup_$(MCU_VARIANT).s

# Linker
LD_FILE_GCC ?= $(ST_CMSIS)/Source/Templates/gcc/linker/$(MCU_VARIANT)_flash.ld
LD_FILE_IAR ?= $(ST_CMSIS)/Source/Templates/iar/linker/$(MCU_VARIANT)_flash.icf
LD_FILE_GCC ?= $(FAMILY_PATH)/linker/$(MCU_VARIANT)_flash.ld
LD_FILE_IAR ?= $(FAMILY_PATH)/linker/$(MCU_VARIANT)_flash.icf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
/*-Specials-*/
define symbol __ICFEDIT_intvec_start__ = 0x08000000;
/*-Memory Regions-*/
define symbol NONCACHEABLEBUFFER_size = 0x4000;
define symbol NONCACHEABLEBUFFER_size = 0x400;
define symbol __ICFEDIT_region_ROM_start__ = 0x08000000;
define symbol __ICFEDIT_region_ROM_end__ = 0x0800FFFF;
define symbol __ICFEDIT_region_RAM_start__ = 0x24000000;
Expand All @@ -14,7 +14,7 @@ define symbol NONCACHEABLEBUFFER_end = __ICFEDIT_region_RAM_end__ + NONCAC


/*-Sizes-*/
define symbol __ICFEDIT_size_cstack__ = 0x800;
define symbol __ICFEDIT_size_cstack__ = 0x400;
define symbol __ICFEDIT_size_heap__ = 0x200;
/**** End of ICF editor section. ###ICF###*/

Expand Down Expand Up @@ -51,5 +51,5 @@ place at address mem:__ICFEDIT_intvec_start__ { readonly section .intvec };

place in ROM_region { readonly };
place in RAM_region { readwrite };
place in DTCM_region { block CSTACK, block HEAP, section dtcm_data };
place in NONCACHEABLE_region { section noncacheable_buffer };
place in DTCM_region { block CSTACK, block HEAP };
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,19 @@
/* Entry Point */
ENTRY(Reset_Handler)

/* Highest address of the user mode stack */
_estack = ORIGIN(DTCM) + LENGTH(DTCM); /* end of "DTCM" Ram type memory */

_Min_Heap_Size = 0x200; /* required amount of heap */
_Min_Stack_Size = 0x400; /* required amount of stack */

__FLASH_BEGIN = 0x08000000;
__FLASH_SIZE = 0x00010000;


__RAM_BEGIN = 0x24000000;
__RAM_SIZE = 0x4FC00;
__RAM_NONCACHEABLEBUFFER_SIZE = 0x4000;
__RAM_NONCACHEABLEBUFFER_SIZE = 0x400;

/* Memories definition */
MEMORY
Expand All @@ -59,9 +63,6 @@ MEMORY
FLASH (xrw) : ORIGIN = __FLASH_BEGIN, LENGTH = __FLASH_SIZE
}

/* Highest address of the user mode stack */
_estack = ORIGIN(DTCM) + LENGTH(DTCM); /* end of "DTCM" Ram type memory */

/* Sections */
SECTIONS
{
Expand Down Expand Up @@ -99,14 +100,14 @@ SECTIONS
. = ALIGN(4);
} >FLASH

.ARM.extab :
.ARM.extab (READONLY) : /* The READONLY keyword is only supported in GCC11 and later, remove it if using GCC10 or earlier. */
{
. = ALIGN(4);
*(.ARM.extab* .gnu.linkonce.armextab.*)
. = ALIGN(4);
} >FLASH

.ARM :
.ARM (READONLY) : /* The READONLY keyword is only supported in GCC11 and later, remove it if using GCC10 or earlier. */
{
. = ALIGN(4);
__exidx_start = .;
Expand All @@ -115,7 +116,7 @@ SECTIONS
. = ALIGN(4);
} >FLASH

.preinit_array :
.preinit_array (READONLY) : /* The READONLY keyword is only supported in GCC11 and later, remove it if using GCC10 or earlier. */
{
. = ALIGN(4);
PROVIDE_HIDDEN (__preinit_array_start = .);
Expand All @@ -124,7 +125,7 @@ SECTIONS
. = ALIGN(4);
} >FLASH

.init_array :
.init_array (READONLY) : /* The READONLY keyword is only supported in GCC11 and later, remove it if using GCC10 or earlier. */
{
. = ALIGN(4);
PROVIDE_HIDDEN (__init_array_start = .);
Expand All @@ -134,7 +135,7 @@ SECTIONS
. = ALIGN(4);
} >FLASH

.fini_array :
.fini_array (READONLY) : /* The READONLY keyword is only supported in GCC11 and later, remove it if using GCC10 or earlier. */
{
. = ALIGN(4);
PROVIDE_HIDDEN (__fini_array_start = .);
Expand Down Expand Up @@ -182,7 +183,7 @@ SECTIONS
{
__NONCACHEABLEBUFFER_BEGIN = .;/* create symbol for start of section */
KEEP(*(noncacheable_buffer))
__NONCACHEABLEBUFFER_END = .; /* create symbol for start of section */
__NONCACHEABLEBUFFER_END = .; /* create symbol for end of section */
} > RAM_NONCACHEABLEBUFFER

/* User_heap_stack section, used to check that there is enough "DTCM" Ram type memory left */
Expand All @@ -196,6 +197,11 @@ SECTIONS
. = ALIGN(8);
} >DTCM

.dtcm_data :
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove this and use noncacheable_buffer instead? I'd like to keep things as close to the stock STM32 linker script as possible.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's possible; an MPU config needs to be added to the BSP to make noncacheable_buffer really non-cached. And I forgot this when I enabled DCache for F7 & H7...

Copy link
Owner

@hathach hathach Jul 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, does the current code configure this section as non-cacheable while the noncacheable_buffer section needs extra work, or do both of them need to be configured the same way?

If removing this requires extra work, then we can keep it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dtcm_data is only used by RTT to have a non-cached buffer; DWC2 doesn't rely on this.

I've added SCB_EnableDCache for benchmarking only; to keep it simple, we can also keep the DCache disabled.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it is possible to use the default noncacheable_buffer section instead of adding a new one. Would that allow us to use the stock IAR linker as well? You may want to pull first, since I pushed the fix for the linker issue with clang (need to have the memory values before provide, and drop the READONLY keywords).

Copy link
Collaborator Author

@HiFiPhile HiFiPhile Jul 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it is possible to use the default noncacheable_buffer section instead of adding a new one. Would that allow us to use the stock IAR linker as well?

I think so; hopefully the MPU config generated by Cube supports both compilers. But the default size of 0x400 is too small for the RTT buffer.

{
*(dtcm_data)
} >DTCM

/* Remove information from the compiler libraries */
/DISCARD/ :
{
Expand Down
8 changes: 4 additions & 4 deletions src/class/audio/audio_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,14 @@

// Put swap buffer in USB section only if necessary
#if USE_LINEAR_BUFFER
#define IN_SW_BUF_MEM_ATTR TU_ATTR_ALIGNED(4)
#define IN_SW_BUF_MEM_ATTR
#else
#define IN_SW_BUF_MEM_ATTR CFG_TUD_MEM_SECTION CFG_TUD_MEM_ALIGN
#define IN_SW_BUF_MEM_ATTR CFG_TUD_MEM_SECTION
#endif
#if USE_LINEAR_BUFFER
#define OUT_SW_BUF_MEM_ATTR TU_ATTR_ALIGNED(4)
#define OUT_SW_BUF_MEM_ATTR
#else
#define OUT_SW_BUF_MEM_ATTR CFG_TUD_MEM_SECTION CFG_TUD_MEM_ALIGN
#define OUT_SW_BUF_MEM_ATTR CFG_TUD_MEM_SECTION
#endif

// EP IN software buffers and mutexes
Expand Down
15 changes: 15 additions & 0 deletions src/common/tusb_mcu.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,12 +220,23 @@
#define TUP_RHPORT_HIGHSPEED 1 // Port0: FS, Port1: HS
#endif

#define CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT 1
#define CFG_TUH_MEM_DCACHE_ENABLE_DEFAULT 1
#define CFG_TUSB_MEM_DCACHE_LINE_SIZE 32

#elif TU_CHECK_MCU(OPT_MCU_STM32H7)
#include "stm32h7xx.h"
#define TUP_USBIP_DWC2
#define TUP_USBIP_DWC2_STM32

#define TUP_DCD_ENDPOINT_MAX 9

#if __CORTEX_M == 7
#define CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT 1
#define CFG_TUH_MEM_DCACHE_ENABLE_DEFAULT 1
#define CFG_TUSB_MEM_DCACHE_LINE_SIZE 32
#endif

#elif TU_CHECK_MCU(OPT_MCU_STM32H5)
#define TUP_USBIP_FSDEV
#define TUP_USBIP_FSDEV_STM32
Expand Down Expand Up @@ -322,6 +333,10 @@
// MCU with on-chip HS Phy
#define TUP_RHPORT_HIGHSPEED 1

#define CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT 1
#define CFG_TUH_MEM_DCACHE_ENABLE_DEFAULT 1
#define CFG_TUSB_MEM_DCACHE_LINE_SIZE 32

//--------------------------------------------------------------------+
// Sony
//--------------------------------------------------------------------+
Expand Down
12 changes: 6 additions & 6 deletions src/common/tusb_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,39 +36,39 @@
#endif

//------------- Device DCache declaration -------------//
#define TUD_EPBUF_DCACHE_SIZE(_size) (CFG_TUD_MEM_DCACHE_ENABLE ? \
#define TUD_EPBUF_DCACHE_SIZE(_size) (TUD_EPBUF_DCACHE_ALIGNED ? \
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why we need to replace CFG_TUD_MEM_DCACHE_ENABLE with TUD_EPBUF_DCACHE_ALIGNED. When DCache is enabled, the memory is certainly required to be cache-line aligned, isn't it?

Copy link
Collaborator Author

@HiFiPhile HiFiPhile Jul 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My idea is to control DCache alignment not only by CFG_TUD_MEM_DCACHE_ENABLE but also by necessity:

  • I set CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT=1 for MCUs with an internal cache so CFG_TUD_MEM_DCACHE_ENABLE doesn't need to be set by the user; it could easily be forgotten, especially during family migration.

  • For DWC2, DCache alignment is only needed when DMA is enabled; without TUD_EPBUF_DCACHE_ALIGNED, buffers would be cache-line aligned by default even when that is not needed.

  • User-imposed alignment via CFG_TUD_MEM_ALIGN always works, since the alignment of the TUD_EPBUF_DEF union is determined by its strictest member.

Copy link
Owner

@hathach hathach Jul 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the point of introducing the TUD_EPBUF_DCACHE_ALIGNED macro; the existing CFG_TUD_MEM_DCACHE_ENABLE and CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT should be good enough. The default value for dwc2 can be CFG_TUD_DWC2_DMA_ENABLE (same as P4). The settings are rather comprehensive already.

  #define CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT  CFG_TUD_DWC2_DMA_ENABLE
  #define CFG_TUH_MEM_DCACHE_ENABLE_DEFAULT  CFG_TUH_DWC2_DMA_ENABLE

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, let me keep it simple.

(TU_DIV_CEIL(_size, CFG_TUD_MEM_DCACHE_LINE_SIZE) * CFG_TUD_MEM_DCACHE_LINE_SIZE) : (_size))

// Declare an endpoint buffer with uint8_t[size]
#define TUD_EPBUF_DEF(_name, _size) \
union { \
CFG_TUD_MEM_ALIGN uint8_t _name[_size]; \
uint8_t _name##_dcache_padding[TUD_EPBUF_DCACHE_SIZE(_size)]; \
TU_ATTR_ALIGNED(TUD_EPBUF_DCACHE_ALIGNED ? CFG_TUD_MEM_DCACHE_LINE_SIZE : 1) uint8_t _name##_dcache_padding[TUD_EPBUF_DCACHE_SIZE(_size)]; \
}

// Declare an endpoint buffer with a type
#define TUD_EPBUF_TYPE_DEF(_type, _name) \
union { \
CFG_TUD_MEM_ALIGN _type _name; \
uint8_t _name##_dcache_padding[TUD_EPBUF_DCACHE_SIZE(sizeof(_type))]; \
TU_ATTR_ALIGNED(TUD_EPBUF_DCACHE_ALIGNED ? CFG_TUD_MEM_DCACHE_LINE_SIZE : 1) uint8_t _name##_dcache_padding[TUD_EPBUF_DCACHE_SIZE(sizeof(_type))]; \
}

//------------- Host DCache declaration -------------//
#define TUH_EPBUF_DCACHE_SIZE(_size) (CFG_TUH_MEM_DCACHE_ENABLE ? \
#define TUH_EPBUF_DCACHE_SIZE(_size) (TUH_EPBUF_DCACHE_ALIGNED ? \
(TU_DIV_CEIL(_size, CFG_TUH_MEM_DCACHE_LINE_SIZE) * CFG_TUH_MEM_DCACHE_LINE_SIZE) : (_size))

// Declare an endpoint buffer with uint8_t[size]
#define TUH_EPBUF_DEF(_name, _size) \
union { \
CFG_TUH_MEM_ALIGN uint8_t _name[_size]; \
uint8_t _name##_dcache_padding[TUH_EPBUF_DCACHE_SIZE(_size)]; \
TU_ATTR_ALIGNED(TUH_EPBUF_DCACHE_ALIGNED ? CFG_TUH_MEM_DCACHE_LINE_SIZE : 1) uint8_t _name##_dcache_padding[TUH_EPBUF_DCACHE_SIZE(_size)]; \
}

// Declare an endpoint buffer with a type
#define TUH_EPBUF_TYPE_DEF(_type, _name) \
union { \
CFG_TUH_MEM_ALIGN _type _name; \
uint8_t _name##_dcache_padding[TUH_EPBUF_DCACHE_SIZE(sizeof(_type))]; \
TU_ATTR_ALIGNED(TUH_EPBUF_DCACHE_ALIGNED ? CFG_TUH_MEM_DCACHE_LINE_SIZE : 1) uint8_t _name##_dcache_padding[TUH_EPBUF_DCACHE_SIZE(sizeof(_type))]; \
}


Expand Down
2 changes: 1 addition & 1 deletion src/portable/synopsys/dwc2/dcd_dwc2.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ TU_ATTR_ALWAYS_INLINE static inline uint8_t dwc2_ep_count(const dwc2_regs_t* dwc
//--------------------------------------------------------------------
// DMA
//--------------------------------------------------------------------
#if CFG_TUD_MEM_DCACHE_ENABLE
#if CFG_TUD_MEM_DCACHE_ENABLE && CFG_TUD_DWC2_DMA_ENABLE
bool dcd_dcache_clean(const void* addr, uint32_t data_size) {
TU_VERIFY(addr && data_size);
return dwc2_dcache_clean(addr, data_size);
Expand Down
Loading