Skip to content

Commit b7a2f44

Browse files
authored
Update mmu_get... and mmu_set... (#8290)
These changes are needed to address bugs that can emerge with the improved optimization from the GCC 10.3 compiler.

Updated the performance inline functions `mmu_get_uint8()`, ... and `mmu_set_uint8()`, ... to comply with strict-aliasing rules. Without this change, stale data may be referenced. This issue was revealed in discussions on #8261 (comment).

Also made changes to keep the new GCC 10.3 compiler from over-optimizing 32-bit wide transfers from IRAM into 8-bit or 16-bit transfers. This has been a recurring, tricky problem for me with the new compiler. So far, referencing the 32-bit value loaded by way of an Extended ASM R/W output register has stopped the compiler from optimizing down to an 8-bit or 16-bit transfer. Example:

```cpp
uint32_t val;
__builtin_memcpy(&val, v32, sizeof(uint32_t));
asm volatile ("" :"+r"(val)); // inject 32-bit dependency
...
```

Updated example `irammem.ino`:
* Do a simple test of compliance with strict-aliasing rules.
* For `mmu_get_uint8()`, added tests to evaluate if 32-bit wide transfers were converted to an 8-bit transfer.
1 parent 9d024d1 commit b7a2f44

File tree

4 files changed

+306
-42
lines changed

4 files changed

+306
-42
lines changed

cores/esp8266/mmu_iram.h

+90-36
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,24 @@
2626
extern "C" {
2727
#endif
2828

29-
//C This turns on range checking. Is this the value you want to trigger it?
29+
// This turns on range checking.
3030
#ifdef DEBUG_ESP_CORE
3131
#define DEBUG_ESP_MMU
3232
#endif
3333

3434
#if defined(CORE_MOCK)
3535
#define ets_uart_printf(...) do {} while(false)
36+
#define XCHAL_INSTRAM0_VADDR 0x40000000
37+
#define XCHAL_INSTRAM1_VADDR 0x40100000
38+
#define XCHAL_INSTROM0_VADDR 0x40200000
39+
#else
40+
#include <sys/config.h> // For config/core-isa.h
41+
/*
42+
Cautiously use XCHAL_..._VADDR values where possible.
43+
While XCHAL_..._VADDR values in core-isa.h may define the Xtensa processor
44+
CONFIG options, they are not always an indication of DRAM, IRAM, or ROM
45+
size or position in the address space.
46+
*/
3647
#endif
3748

3849
/*
@@ -71,32 +82,34 @@ DBG_MMU_FLUSH(0)
7182

7283
static inline __attribute__((always_inline))
7384
bool mmu_is_iram(const void *addr) {
74-
#define IRAM_START 0x40100000UL
85+
const uintptr_t iram_start = (uintptr_t)XCHAL_INSTRAM1_VADDR;
7586
#ifndef MMU_IRAM_SIZE
7687
#if defined(__GNUC__) && !defined(CORE_MOCK)
7788
#warning "MMU_IRAM_SIZE was undefined, setting to 0x8000UL!"
7889
#endif
79-
#define MMU_IRAM_SIZE 0x8000UL
90+
#define MMU_IRAM_SIZE 0x8000ul
8091
#endif
81-
#define IRAM_END (IRAM_START + MMU_IRAM_SIZE)
92+
const uintptr_t iram_end = iram_start + MMU_IRAM_SIZE;
8293

83-
return (IRAM_START <= (uintptr_t)addr && IRAM_END > (uintptr_t)addr);
94+
return (iram_start <= (uintptr_t)addr && iram_end > (uintptr_t)addr);
8495
}
8596

8697
static inline __attribute__((always_inline))
8798
bool mmu_is_dram(const void *addr) {
88-
#define DRAM_START 0x3FF80000UL
89-
#define DRAM_END 0x40000000UL
99+
const uintptr_t dram_start = 0x3FFE8000ul;
100+
// The start of the Boot ROM sits at the end of DRAM. 0x40000000ul;
101+
const uintptr_t dram_end = (uintptr_t)XCHAL_INSTRAM0_VADDR;
90102

91-
return (DRAM_START <= (uintptr_t)addr && DRAM_END > (uintptr_t)addr);
103+
return (dram_start <= (uintptr_t)addr && dram_end > (uintptr_t)addr);
92104
}
93105

94106
static inline __attribute__((always_inline))
95107
bool mmu_is_icache(const void *addr) {
96-
#define ICACHE_START 0x40200000UL
97-
#define ICACHE_END (ICACHE_START + 0x100000UL)
108+
extern void _irom0_text_end(void);
109+
const uintptr_t icache_start = (uintptr_t)XCHAL_INSTROM0_VADDR;
110+
const uintptr_t icache_end = (uintptr_t)_irom0_text_end;
98111

99-
return (ICACHE_START <= (uintptr_t)addr && ICACHE_END > (uintptr_t)addr);
112+
return (icache_start <= (uintptr_t)addr && icache_end > (uintptr_t)addr);
100113
}
101114

102115
#ifdef DEBUG_ESP_MMU
@@ -127,90 +140,131 @@ bool mmu_is_icache(const void *addr) {
127140
static inline __attribute__((always_inline))
128141
uint8_t mmu_get_uint8(const void *p8) {
129142
ASSERT_RANGE_TEST_READ(p8);
130-
uint32_t val = (*(uint32_t *)((uintptr_t)p8 & ~0x3));
131-
uint32_t pos = ((uintptr_t)p8 & 0x3) * 8;
143+
// https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly
144+
// Comply with strict-aliasing rules. Using memcpy is a Standards suggested
145+
// method for type punning. The compiler optimizer will replace the memcpy
146+
// with an `l32i` instruction. Using __builtin_memcpy to ensure we get the
147+
// effects of the compiler optimization and not some #define version of
148+
// memcpy.
149+
void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
150+
uint32_t val;
151+
__builtin_memcpy(&val, v32, sizeof(uint32_t));
152+
// Use an empty ASM to reference the 32-bit value. This will block the
153+
// compiler from immediately optimizing to an 8-bit or 16-bit load instruction
154+
// against IRAM memory. (This approach was inspired by
155+
// https://github.com/esp8266/Arduino/pull/7780#discussion_r548303374)
156+
// This issue was seen when using a constant address with the GCC 10.3
157+
// compiler.
158+
// As a general practice, I think referencing by way of Extended ASM R/W
159+
// output register will stop the compiler from reloading the value later
160+
// as an 8-bit load from IRAM.
161+
asm volatile ("" :"+r"(val)); // inject 32-bit dependency
162+
uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
132163
val >>= pos;
133164
return (uint8_t)val;
134165
}
135166

136167
static inline __attribute__((always_inline))
137168
uint16_t mmu_get_uint16(const uint16_t *p16) {
138169
ASSERT_RANGE_TEST_READ(p16);
139-
uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3));
140-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
170+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)0x3u);
171+
uint32_t val;
172+
__builtin_memcpy(&val, v32, sizeof(uint32_t));
173+
asm volatile ("" :"+r"(val));
174+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
141175
val >>= pos;
142176
return (uint16_t)val;
143177
}
144178

145179
static inline __attribute__((always_inline))
146180
int16_t mmu_get_int16(const int16_t *p16) {
147181
ASSERT_RANGE_TEST_READ(p16);
148-
uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3));
149-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
182+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
183+
uint32_t val;
184+
__builtin_memcpy(&val, v32, sizeof(uint32_t));
185+
asm volatile ("" :"+r"(val));
186+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
150187
val >>= pos;
151188
return (int16_t)val;
152189
}
153190

154191
static inline __attribute__((always_inline))
155192
uint8_t mmu_set_uint8(void *p8, const uint8_t val) {
156193
ASSERT_RANGE_TEST_WRITE(p8);
157-
uint32_t pos = ((uintptr_t)p8 & 0x3) * 8;
194+
uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
158195
uint32_t sval = val << pos;
159-
uint32_t valmask = 0x0FF << pos;
196+
uint32_t valmask = 0x0FFu << pos;
197+
198+
void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
199+
uint32_t ival;
200+
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
201+
asm volatile ("" :"+r"(ival));
160202

161-
uint32_t *p32 = (uint32_t *)((uintptr_t)p8 & ~0x3);
162-
uint32_t ival = *p32;
163203
ival &= (~valmask);
164204
ival |= sval;
165-
*p32 = ival;
205+
/*
206+
This 32-bit dependency injection does not appear to be needed with the
207+
current GCC 10.3; however, that could change in future versions. Or, I
208+
may not have the right test for it to fail.
209+
*/
210+
asm volatile ("" :"+r"(ival));
211+
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
166212
return val;
167213
}
168214

169215
static inline __attribute__((always_inline))
170216
uint16_t mmu_set_uint16(uint16_t *p16, const uint16_t val) {
171217
ASSERT_RANGE_TEST_WRITE(p16);
172-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
218+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
173219
uint32_t sval = val << pos;
174-
uint32_t valmask = 0x0FFFF << pos;
220+
uint32_t valmask = 0x0FFFFu << pos;
221+
222+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
223+
uint32_t ival;
224+
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
225+
asm volatile ("" :"+r"(ival));
175226

176-
uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3);
177-
uint32_t ival = *p32;
178227
ival &= (~valmask);
179228
ival |= sval;
180-
*p32 = ival;
229+
asm volatile ("" :"+r"(ival));
230+
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
181231
return val;
182232
}
183233

184234
static inline __attribute__((always_inline))
185235
int16_t mmu_set_int16(int16_t *p16, const int16_t val) {
186236
ASSERT_RANGE_TEST_WRITE(p16);
187237
uint32_t sval = (uint16_t)val;
188-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
238+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
189239
sval <<= pos;
190-
uint32_t valmask = 0x0FFFF << pos;
240+
uint32_t valmask = 0x0FFFFu << pos;
241+
242+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
243+
uint32_t ival;
244+
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
245+
asm volatile ("" :"+r"(ival));
191246

192-
uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3);
193-
uint32_t ival = *p32;
194247
ival &= (~valmask);
195248
ival |= sval;
196-
*p32 = ival;
249+
asm volatile ("" :"+r"(ival));
250+
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
197251
return val;
198252
}
199253

200254
#if (MMU_IRAM_SIZE > 32*1024) && !defined(MMU_SEC_HEAP)
201-
extern void _text_end(void);
202255
#define MMU_SEC_HEAP mmu_sec_heap()
203256
#define MMU_SEC_HEAP_SIZE mmu_sec_heap_size()
204257

205258
static inline __attribute__((always_inline))
206259
void *mmu_sec_heap(void) {
207-
uint32_t sec_heap = (uint32_t)_text_end + 32;
208-
return (void *)(sec_heap &= ~7);
260+
extern void _text_end(void);
261+
uintptr_t sec_heap = (uintptr_t)_text_end + (uintptr_t)32u;
262+
return (void *)(sec_heap &= ~(uintptr_t)7u);
209263
}
210264

211265
static inline __attribute__((always_inline))
212266
size_t mmu_sec_heap_size(void) {
213-
return (size_t)0xC000UL - ((size_t)mmu_sec_heap() - 0x40100000UL);
267+
return (size_t)0xC000ul - ((uintptr_t)mmu_sec_heap() - (uintptr_t)XCHAL_INSTRAM1_VADDR);
214268
}
215269
#endif
216270

libraries/esp8266/examples/IramReserve/IramReserve.ino

+9-3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
#include <umm_malloc/umm_malloc.h>
1818
#if defined(UMM_HEAP_IRAM)
1919

20+
#if defined(CORE_MOCK)
21+
#define XCHAL_INSTRAM1_VADDR 0x40100000
22+
#else
23+
#include <sys/config.h> // For config/core-isa.h
24+
#endif
25+
2026
// durable - as in long life, persisting across reboots.
2127
struct durable {
2228
uint32_t bootCounter;
@@ -30,7 +36,7 @@ struct durable {
3036
#define IRAM_RESERVE_SZ ((sizeof(struct durable) + 7UL) & ~7UL)
3137

3238
// Position its address just above the reduced 2nd Heap.
33-
#define IRAM_RESERVE (0x40100000UL + 0xC000UL - IRAM_RESERVE_SZ)
39+
#define IRAM_RESERVE ((uintptr_t)XCHAL_INSTRAM1_VADDR + 0xC000UL - IRAM_RESERVE_SZ)
3440

3541
// Define a reference with the right properties to make access easier.
3642
#define DURABLE ((struct durable *)IRAM_RESERVE)
@@ -100,9 +106,9 @@ extern "C" void umm_init_iram(void) {
100106
adjustments and checksums. These can affect the persistence of data across
101107
reboots.
102108
*/
103-
uint32_t sec_heap = (uint32_t)_text_end + 32;
109+
uintptr_t sec_heap = (uintptr_t)_text_end + 32;
104110
sec_heap &= ~7;
105-
size_t sec_heap_sz = 0xC000UL - (sec_heap - 0x40100000UL);
111+
size_t sec_heap_sz = 0xC000UL - (sec_heap - (uintptr_t)XCHAL_INSTRAM1_VADDR);
106112
sec_heap_sz -= IRAM_RESERVE_SZ; // Shrink IRAM heap
107113
if (0xC000UL > sec_heap_sz) {
108114

libraries/esp8266/examples/MMU48K/MMU48K.ino

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
#include <umm_malloc/umm_malloc.h>
44
#include <umm_malloc/umm_heap_select.h>
55

6+
#if defined(CORE_MOCK)
7+
#define XCHAL_INSTRAM1_VADDR 0x40100000
8+
#else
9+
#include <sys/config.h> // For config/core-isa.h
10+
#endif
11+
612
uint32_t timed_byte_read(char *pc, uint32_t * o);
713
uint32_t timed_byte_read2(char *pc, uint32_t * o);
814
int divideA_B(int a, int b);
@@ -102,7 +108,7 @@ void print_mmu_status(Print& oStream) {
102108
#ifdef MMU_IRAM_SIZE
103109
oStream.printf_P(PSTR(" IRAM Size: %u"), MMU_IRAM_SIZE);
104110
oStream.println();
105-
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - 0x40100000UL);
111+
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - (uintptr_t)XCHAL_INSTRAM1_VADDR);
106112
oStream.printf_P(PSTR(" IRAM free: %u"), iram_free);
107113
oStream.println();
108114
#endif

0 commit comments

Comments
 (0)