Skip to content

Add DWC2 cache maintenance routines for STM32 #2963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from

Conversation

HiFiPhile
Copy link
Collaborator

@HiFiPhile HiFiPhile commented Jan 25, 2025

Describe the PR

  • Support DMA + DCache ON
  • Refactored buffer alignment macro to take into account cache line size

Now #define CFG_TUD_DWC2_DMA_ENABLE 1 is enough.

Rebased on master and cleaned up.

It's preferable to declare a non-cached region with the MPU instead of relying on cache invalidate+clean; benchmarks on STM32H7S3 and i.MX RT1170 show that frequent cache invalidate+clean really hurts performance.

  • Tested on STM32H7S3:
Device Host
cdc_msc device_info
audio_4_channel_mic
uac2_headset
Benchmark code
extern uint32_t SystemCoreClock;

/**
  * @brief Unlock the SWO trace funnel and SWO blocks, program the SWO output
  *        divisor from SystemCoreClock, select the pin protocol and enable the
  *        ITM input of the trace funnel.
  * @note  All registers are accessed through their absolute addresses; the
  *        register names in the trailing comments are the ones given by the
  *        original author.
  * @retval None
  */
void SWD_Init(void)
{
  /* Unlock the funnel and SWO register blocks by writing the unlock key
     to their lock access registers */
  *(volatile uint32_t*)(0x5C004FB0) = 0xC5ACCE55; // SWTF_LAR
  *(volatile uint32_t*)(0x5C003FB0) = 0xC5ACCE55; // SWO_LAR

  /* SWO current output divisor register
     value = (CPU freq / SWO speed) - 1, computed here for a 12 MHz SWO clock.
     (For reference: 0x000000C7 would correspond to a 2 MHz SWO clock with a
     400 MHz core.) */
  *(volatile uint32_t*)(0x5C003010) = ((SystemCoreClock / 12000000) - 1); // SWO_CODR

  /* SWO selected pin protocol register
     NOTE(review): 0x2 presumably selects NRZ (UART-style) encoding - confirm
     against the reference manual */
  *(volatile uint32_t*)(0x5C0030F0) = 0x00000002; // SWO_SPPR

  /* Enable ITM input of SWO trace funnel */
  *(volatile uint32_t*)(0x5C004000) |= 0x00000001; // SWTF_CTRL
}

/* Forward declaration: the helper is defined after MPU_Config, which calls it. */
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit);

/**
  * @brief Program the MPU with up to three regions, disable every remaining
  *        region, then re-enable the MPU.
  *
  *        Regions are programmed in increasing region-number order:
  *          1. a 4 GB no-access background region,
  *          2. the non-cacheable buffer area delimited by toolchain-specific
  *             linker symbols (skipped when its size is 0),
  *          3. the execution (ROM/flash) area, cacheable and executable.
  *
  * @note  A single MPU_Region_InitTypeDef is reused for all regions, so any
  *        field not reassigned before a HAL_MPU_ConfigRegion() call keeps the
  *        value set for the previous region. Statement order matters.
  * @retval None
  */
static void MPU_Config(void)
{
  MPU_Region_InitTypeDef MPU_InitStruct = {0};
  uint32_t index = MPU_REGION_NUMBER0;   /* next MPU region number to program */
  uint32_t address;                      /* start address of the region being set up */
  uint32_t size;                         /* size in bytes of the region being set up */

  /* Disable the MPU */
  HAL_MPU_Disable();

  /* Initialize the background region */
  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.BaseAddress = 0x0;
  MPU_InitStruct.Size = MPU_REGION_SIZE_4GB;
  /* NOTE(review): 0x87 sets sub-region-disable bits 0-2 and 7; with eight equal
     sub-regions of a 4 GB region this would presumably leave the no-access rule
     active only for 0x60000000-0xDFFFFFFF - confirm against the MPU documentation */
  MPU_InitStruct.SubRegionDisable = 0x87;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL0;
  MPU_InitStruct.AccessPermission = MPU_REGION_NO_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Initialize the non cacheable region */
#if defined ( __ICCARM__ )
  /* get the region attribute from the icf file */
  extern uint32_t NONCACHEABLEBUFFER_start;
  extern uint32_t NONCACHEABLEBUFFER_size;

  /* The symbols' addresses (not their contents) carry the start and size values */
  address = (uint32_t)&NONCACHEABLEBUFFER_start;
  size = (uint32_t)&NONCACHEABLEBUFFER_size;

#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Base;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Length;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;

  /* Total size is the sum of the two length symbols (Length + ZI$$Length) */
  address = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Base;
  size  = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Length + (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;
#elif defined ( __GNUC__ )
  extern int __NONCACHEABLEBUFFER_BEGIN;
  extern int __NONCACHEABLEBUFFER_END;

  /* Size is the distance between the begin and end linker symbols */
  address = (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
  size  = (uint32_t)&__NONCACHEABLEBUFFER_END - (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
#else
#error "Compiler toolchain is unsupported"
#endif

  /* Only spend an MPU region on the non-cacheable area when it is not empty */
  if (size != 0)
  {
    /* Configure the MPU attributes as Normal Non Cacheable */
    MPU_InitStruct.Enable = MPU_REGION_ENABLE;
    MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
    MPU_InitStruct.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE;
    MPU_InitStruct.IsCacheable = MPU_ACCESS_NOT_CACHEABLE;
    MPU_InitStruct.IsShareable = MPU_ACCESS_NOT_SHAREABLE;
    MPU_InitStruct.Number = index;
    MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
    MPU_InitStruct.SubRegionDisable = 0x00;
    MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
    /* Fills in BaseAddress and Size from the linker-provided range */
    MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
    index++;
  }

  /* Initialize the region corresponding to the execution area
     (external or internal flash or external or internal RAM
     depending on scatter file definition) */
#if defined ( __ICCARM__ )
  extern uint32_t __ICFEDIT_region_ROM_start__;
  extern uint32_t __ICFEDIT_region_ROM_end__;
  address = (uint32_t)&__ICFEDIT_region_ROM_start__;
  /* +1 because the end symbol is treated as the last address of the region (inclusive) */
  size = (uint32_t)&__ICFEDIT_region_ROM_end__ - (uint32_t)&__ICFEDIT_region_ROM_start__ + 1;
#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$ER_ROM$$Base;
  extern uint32_t Image$$ER_ROM$$Limit;
  address = (uint32_t)&Image$$ER_ROM$$Base;
  size    = (uint32_t)&Image$$ER_ROM$$Limit-(uint32_t)&Image$$ER_ROM$$Base;
#elif defined ( __GNUC__ )
  extern uint32_t __FLASH_BEGIN;
  extern uint32_t __FLASH_SIZE;
  /* Here the size comes directly from the address of the __FLASH_SIZE symbol */
  address = (uint32_t)&__FLASH_BEGIN;
  size  = (uint32_t)&__FLASH_SIZE;
#else
#error "Compiler toolchain is unsupported"
#endif

  /* Execution area: full access, instruction fetch allowed, cacheable and bufferable.
     Unlike the non-cacheable region above, this one is programmed even if size is 0. */
  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.SubRegionDisable = 0u;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
  MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_ENABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Reset unused MPU regions */
  for(; index < __MPU_REGIONCOUNT ; index++)
  {
    /* All unused regions disabled; the other fields still hold the values of
       the last programmed region */
    MPU_InitStruct.Enable = MPU_REGION_DISABLE;
    MPU_InitStruct.Number = index;
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
  }

  /* Enable the MPU */
  /* NOTE(review): MPU_PRIVILEGED_DEFAULT presumably keeps the default memory map
     as background for privileged accesses - confirm against the HAL documentation */
  HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT);
}

/**
  * @brief This function adjusts the MPU region Address and Size within an MPU configuration.
  *
  *        The Size field written to pInit uses the MPU encoding where a region
  *        spans 2^(Size + 1) bytes: the requested byte count is rounded up to
  *        the next power of two (minimum 32 bytes). The base address is then
  *        aligned DOWN to that region size. The region is not grown to make up
  *        for the downward alignment, so if Address is not naturally aligned
  *        the end of [Address, Address + Size) may fall outside the region.
  *
  * @param Address memory address
  * @param Size memory size in bytes (values below 32, including 0, are treated as 32)
  * @param pInit pointer to an MPU initialization structure; only the Size and
  *        BaseAddress fields are written
  * @retval None
  */
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit)
{
  /* Clamp to the smallest encodable region; this also keeps __CLZ() away from 0,
     which would otherwise produce a negative exponent and an undefined shift */
  uint32_t length = (Size < 32u) ? 32u : Size;

  /* Compute the MPU region size: start from the largest power of two that does
     not exceed the length, then round up if the length is not a power of two */
  uint32_t size_field = (31u - (uint32_t)__CLZ(length)) - 1u;
  if (length > ((uint32_t)1u << (size_field + 1u)))
  {
    size_field++;
  }
  pInit->Size = (uint8_t)size_field;

  /* Mask of the address bits that lie inside one region of 2^(size_field + 1) bytes.
     A 4 GB region (size_field == 31) is handled separately because shifting a
     32-bit value by 32 is undefined. */
  uint32_t offset_mask = (size_field >= 31u) ? 0xFFFFFFFFu
                                             : (((uint32_t)1u << (size_field + 1u)) - 1u);

  /* Align address with MPU region size considering there is no need to increase the size */
  pInit->BaseAddress = Address & ~offset_mask;
}

/* Number of bytes in each benchmark buffer. */
#define TEST_SIZE 1024

/* Two buffers placed in the "dtcm_data" section (expected to be mapped to DTCM
   by the linker script), 32-byte aligned. */
__attribute__((section("dtcm_data"), aligned(32))) uint8_t buffer1[TEST_SIZE];
__attribute__((section("dtcm_data"), aligned(32))) uint8_t buffer2[TEST_SIZE];

/* Buffer placed in the "noncacheable" section, 32-byte aligned. */
__attribute__((section("noncacheable"), aligned(32))) uint8_t buffer_ncache[TEST_SIZE];

/* Buffer left in the default data section, 32-byte aligned. */
__attribute__((aligned(32))) uint8_t buffer_cached[TEST_SIZE];

/**
 * Time one memcpy of `size` bytes from `src` to `dst` with the cycle counter.
 *
 * The counter is zeroed and started before the optional cache invalidate, and
 * stopped after the optional cache clean, so both maintenance operations are
 * included in the measurement.
 *
 * @param dst        destination buffer
 * @param src        source buffer
 * @param size       number of bytes to copy
 * @param flush      when true, clean the D-cache over `dst` after the copy
 * @param invalidate when true, invalidate the D-cache over `src` before the copy
 * @return value of the cycle counter after it has been stopped
 */
unsigned int test_loop(void* dst, const void* src, int size, bool flush, bool invalidate)
{
    /* Debug/trace registers, accessed by absolute address */
    volatile unsigned int *const demcr_reg    = (volatile unsigned int *)0xE000EDFC; /* SCB_DEMCR   */
    volatile unsigned int *const dwt_ctrl_reg = (volatile unsigned int *)0xE0001000; /* DWT_CONTROL */
    volatile unsigned int *const cyccnt_reg   = (volatile unsigned int *)0xE0001004; /* DWT_CYCCNT  */

    /* Set bit 24 of DEMCR, reset the counter, then set bit 0 of DWT_CONTROL to start counting */
    *demcr_reg |= 0x01000000;
    *cyccnt_reg = 0;
    *dwt_ctrl_reg |= 1;

    if (invalidate)
    {
        SCB_InvalidateDCache_by_Addr((uint32_t*)src, size);
    }

    memcpy(dst, src, size);

    if (flush)
    {
        SCB_CleanDCache_by_Addr((uint32_t*)dst, size);
    }

    /* Clear bit 0 of DWT_CONTROL to stop counting before reading the result */
    *dwt_ctrl_reg &= ~1;
    return *cyccnt_reg;
}

/**
 * Run one memcpy scenario eight times and print the results.
 *
 * Prints the scenario title, the cycle count of every iteration, and a
 * throughput figure derived from the LAST iteration only.
 *
 * @param title      label printed before the measurements
 * @param dst        destination buffer (TEST_SIZE bytes)
 * @param src        source buffer (TEST_SIZE bytes)
 * @param flush      forwarded to test_loop()
 * @param invalidate forwarded to test_loop()
 */
static void run_copy_benchmark(const char* title, void* dst, const void* src, bool flush, bool invalidate)
{
  unsigned int cycles = 0;

  printf("%s\r\n", title);
  for (int i = 0; i < 8; i++)
  {
    cycles = test_loop(dst, src, TEST_SIZE, flush, invalidate);
    printf("Loop:%d   Cycle:%u\r\n", i, cycles);
  }

  /* bytes * Hz / cycles = bytes per second. Multiply before dividing, in 64 bits,
     so nothing is truncated early and the product cannot overflow. A zero cycle
     count is reported as 0 MB/s instead of dividing by zero. */
  uint64_t mb_per_s = 0;
  if (cycles != 0)
  {
    mb_per_s = ((uint64_t)TEST_SIZE * SystemCoreClock) / cycles / 1048576u;
  }
  printf("Throughput:%lu MB/s\r\n", (unsigned long)mb_per_s);
}

/**
 * Benchmark entry point: configure the MPU, the board and SWO output, fill the
 * source buffer with a byte ramp, run every memcpy scenario, then spin forever.
 */
int main(void) {
  MPU_Config();
  board_init();
  SWD_Init();

  printf("\r\nmemcpy benchmark \r\n");

  /* Fill the source buffer with a repeating 0..255 pattern */
  for (int i = 0; i < TEST_SIZE; i++)
  {
    buffer1[i] = (uint8_t)i;
  }

  /* Writes from DTCM into each kind of destination memory */
  run_copy_benchmark("DTCM - DTCM", buffer2, buffer1, false, false);
  run_copy_benchmark("DTCM - NonCache", buffer_ncache, buffer1, false, false);
  run_copy_benchmark("DTCM - Cache", buffer_cached, buffer1, false, false);
  run_copy_benchmark("DTCM - Cache+Flush", buffer_cached, buffer1, true, false);

  /* Reads from each kind of source memory into DTCM */
  run_copy_benchmark("NonCache - DTCM", buffer1, buffer_ncache, false, false);
  run_copy_benchmark("Cache - DTCM", buffer1, buffer_cached, false, false);
  run_copy_benchmark("Cache+Invalidate - DTCM", buffer1, buffer_cached, false, true);

  while (1)
  {
  }
}

@HiFiPhile
Copy link
Collaborator Author

Looks like my HIL instance has a license issue; I think we can add the env locally.

@pstadelmann
Copy link

pstadelmann commented Jan 27, 2025

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

@HiFiPhile
Copy link
Collaborator Author

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test.
It's a little strange that you need to define __CORTEX_M; normally stm32h7xx.h includes the device header, e.g. stm32h747xx.h, which includes core_cm7.h, and __CORTEX_M is defined inside. Did I miss something?

@hathach
Copy link
Owner

hathach commented Jan 28, 2025

Thanks @HiFiPhile for great Pr as usual. Though I am off for TET (Lunar New Year) and won't be able to review this in 2 weeks. Happy New Year 🎉

@pstadelmann
Copy link

Works great for me, both for CDC and UVC. Thanks !
In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test. It's a little strange that you need to define __CORTEX_M; normally stm32h7xx.h includes the device header, e.g. stm32h747xx.h, which includes core_cm7.h, and __CORTEX_M is defined inside. Did I miss something?

I don't think so. I'm using a custom RTOS which relies on its own set of headers, that's why.

@HiFiPhile
Copy link
Collaborator Author

Thanks @HiFiPhile for great Pr as usual. Though I am off for TET (Lunar New Year) and won't be able to review this in 2 weeks. Happy New Year 🎉

Happy new year also 🎊

{
__NONCACHEABLEBUFFER_BEGIN = .;/* create symbol for start of section */
KEEP(*(noncacheable))
__NONCACHEABLEBUFFER_END = .; /* create symbol for start of section */

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment is wrong. It should be end of section.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, it's copied from STM32CubeH7RS.

@HiFiPhile HiFiPhile marked this pull request as ready for review June 13, 2025 14:23
@HiFiPhile
Copy link
Collaborator Author

@hathach I just rebased on master and did some cleanup.
CFG_TUD_MEM_ALIGN introduces a breaking change, but without this I don't know how to extract the numeric value.

@hathach
Copy link
Owner

hathach commented Jun 13, 2025

@hathach I just rebased on master and did some cleanup. CFG_TUD_MEM_ALIGN introduce a breaking change, but without this I don't know how to extract numeric value.

How about adding CFG_TUD_MEM_ALIGN_BYTE/SIZE instead? CFG_TUD_MEM_ALIGN, if not defined, will be defined using that. That way people will have time to migrate (or just leave it if they don't need it). Btw, why do you need the numeric value? (I haven't done the review yet.)

@HiFiPhile
Copy link
Collaborator Author

HiFiPhile commented Jun 13, 2025

How about adding the CFG_TUD_MEM_ALIGN_BYTE/SIZE instead, CFG_TUD_MEM_ALIGN if not defined will be defined using that.

Yeah, it's a good idea. I need to align the buffer correctly if the user needs a higher alignment than the cache line size.

Update: no more need to change CFG_TUD_MEM_ALIGN

@HiFiPhile HiFiPhile force-pushed the stm32_cache branch 4 times, most recently from 21282bb to 15f32f2 Compare June 13, 2025 20:40
@@ -196,6 +197,11 @@ SECTIONS
. = ALIGN(8);
} >DTCM

.dtcm_data :
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove this and use the noncacheable_buffer instead? I'd like to keep things as close to the stock STM32 linker script as possible.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's possible; an MPU config needs to be added to the BSP to make noncacheable_buffer really non-cached. And I forgot this when I enabled DCache for F7 & H7...

Copy link
Owner

@hathach hathach Jul 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait, does the current code configure this section as non-cacheable, while noncacheable_buffer section need extra works, or both of them need to be configured the same way ?

if removing this requires extra works, then we can keep it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dtcm_data is only used by RTT to have a non-cached buffer; DWC2 doesn't rely on it.

I've added SCB_EnableDCache for benchmark only, to keep it simple we can also keep the DCache disabled.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so it is possible to use the default noncacheable_buffer section instead of adding new one. That would allow us to use the stock IAR linker as well ? You may want to pull first, since, I push the fix for the linker issue with clang (need to have memory value before provide, and drop the READONLY keywords)

Copy link
Collaborator Author

@HiFiPhile HiFiPhile Jul 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so it is possible to use the default noncacheable_buffer section instead of adding new one. That would allow us to use the stock IAR linker as well ?

I think so, hope MPU config generated by Cube support both compilers. But the default size of 0x400 is too small for RTT buffer.

@@ -36,39 +36,39 @@
#endif

//------------- Device DCache declaration -------------//
#define TUD_EPBUF_DCACHE_SIZE(_size) (CFG_TUD_MEM_DCACHE_ENABLE ? \
#define TUD_EPBUF_DCACHE_SIZE(_size) (TUD_EPBUF_DCACHE_ALIGNED ? \
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why we need to replace CFG_TUD_MEM_DCACHE_ENABLE with TUD_EPBUF_DCACHE_ALIGNED; when DCache is enabled, the memory is certainly required to be cache-line aligned, isn't it?

Copy link
Collaborator Author

@HiFiPhile HiFiPhile Jul 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My idea is to control DCache alignment not only by CFG_TUD_MEM_DCACHE_ENABLE but by the necessity:

  • I set CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT=1 for MCUs with an internal cache so that CFG_TUD_MEM_DCACHE_ENABLE doesn't need to be set by the user; it could easily be forgotten, especially during a family migration.

  • For DWC2, DCache alignment is only needed when DMA is enabled; without TUD_EPBUF_DCACHE_ALIGNED, buffers would be cache-line aligned by default even when it is not needed.

  • User imposed alignment CFG_TUD_MEM_ALIGN always works since the alignment of TUD_EPBUF_DEF union is determined by the most strict member.

Copy link
Owner

@hathach hathach Jul 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the point to introduce TUD_EPBUF_DCACHE_ALIGNED macro, the existing CFG_TUD_MEM_DCACHE_ENABLE and CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT should be good enough, the default value of dwc2 can be CFG_TUD_DWC2_DMA_ENABLE (same as P4). The setting is rather comprehensive already .

  #define CFG_TUD_MEM_DCACHE_ENABLE_DEFAULT  CFG_TUD_DWC2_DMA_ENABLE
  #define CFG_TUH_MEM_DCACHE_ENABLE_DEFAULT  CFG_TUH_DWC2_DMA_ENABLE

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, let me keep it simple.

@HiFiPhile
Copy link
Collaborator Author

I've updated BSP:

  • Keep DCache disabled for H7 & F7 for RTT compatibility
  • Add MPU_Config to H7RS and use noncacheable_buffer for RTT buffer

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants