From c7404a3ff89c738c002ec63448a58d56330dfcc3 Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Sun, 11 Oct 2020 14:59:33 +0200
Subject: [PATCH 1/9] Add movable allocation system.

This allows calls to `allocate_memory()` while the VM is running. A movable allocation requested at that time is taken from the GC heap (unless there is a suitable hole among the supervisor allocations). When the VM exits and the GC heap is freed, the allocation is moved to the bottom of the former GC heap and transformed into a proper supervisor allocation. Existing movable allocations are also moved, to defragment the supervisor heap and to ensure that the next VM run gets as much memory as possible for the GC heap.

By itself, this breaks terminalio because it violates the assumption that supervisor_display_move_memory() still has access to an undisturbed heap to copy the tilegrid from. It will work in many cases, but if you're unlucky, you will get garbled terminal contents after exiting from the VM run that created the display. This will be fixed in the following commit, which is kept separate to simplify review.
---
 main.c                                        |  13 +-
 ports/atmel-samd/supervisor/port.c            |   4 +-
 ports/cxd56/supervisor/port.c                 |   8 +-
 ports/esp32s2/supervisor/port.c               |   8 +-
 ports/litex/supervisor/port.c                 |   4 +-
 ports/mimxrt10xx/supervisor/port.c            |   7 +-
 ports/nrf/supervisor/port.c                   |   4 +-
 ports/stm/supervisor/port.c                   |   4 +-
 py/circuitpy_mpconfig.h                       |   4 +
 shared-module/rgbmatrix/RGBMatrix.c           |   2 +-
 .../sharpdisplay/SharpMemoryFramebuffer.c     |   2 +-
 shared-module/usb_midi/__init__.c             |   2 +-
 supervisor/memory.h                           |  26 +-
 supervisor/port.h                             |   5 +-
 supervisor/shared/display.c                   |   4 +-
 .../shared/external_flash/external_flash.c    |   2 +-
 supervisor/shared/memory.c                    | 277 +++++++++++++-----
 supervisor/shared/stack.c                     |  38 ++-
 supervisor/shared/stack.h                     |   6 +-
 19 files changed, 283 insertions(+), 137 deletions(-)

diff --git a/main.c b/main.c
index 80b163f6079f7..1389f524b6b54 100755
--- a/main.c
+++ b/main.c
@@ -123,15 +123,15 @@ void start_mp(supervisor_allocation* heap) {
     // to recover from limit hit.  (Limit is measured in bytes.)
     mp_stack_ctrl_init();
 
-    if (stack_alloc != NULL) {
-        mp_stack_set_limit(stack_alloc->length - 1024);
+    if (stack_get_bottom() != NULL) {
+        mp_stack_set_limit(stack_get_length() - 1024);
     }
 
 
 #if MICROPY_MAX_STACK_USAGE
     // _ezero (same as _ebss) is an int, so start 4 bytes above it.
-    if (stack_alloc != NULL) {
-        mp_stack_set_bottom(stack_alloc->ptr);
+    if (stack_get_bottom() != NULL) {
+        mp_stack_set_bottom(stack_get_bottom());
         mp_stack_fill_with_sentinel();
     }
 #endif
@@ -148,7 +148,7 @@ void start_mp(supervisor_allocation* heap) {
     #endif
 
     #if MICROPY_ENABLE_GC
-    gc_init(heap->ptr, heap->ptr + heap->length / 4);
+    gc_init(heap->ptr, heap->ptr + get_allocation_length(heap) / 4);
     #endif
     mp_init();
     mp_obj_list_init(mp_sys_path, 0);
@@ -451,9 +451,6 @@ int __attribute__((used)) main(void) {
     // initialise the cpu and peripherals
     safe_mode_t safe_mode = port_init();
 
-    // Init memory after the port in case the port needs to set aside memory.
-    memory_init();
-
     // Turn on LEDs
     init_status_leds();
     rgb_led_status_init();
diff --git a/ports/atmel-samd/supervisor/port.c b/ports/atmel-samd/supervisor/port.c
index d65d098257fc7..fc1d1198e2f45 100644
--- a/ports/atmel-samd/supervisor/port.c
+++ b/ports/atmel-samd/supervisor/port.c
@@ -390,8 +390,8 @@ void reset_cpu(void) {
     reset();
 }
 
-supervisor_allocation* port_fixed_stack(void) {
-    return NULL;
+bool port_has_fixed_stack(void) {
+    return false;
 }
 
 uint32_t *port_stack_get_limit(void) {
diff --git a/ports/cxd56/supervisor/port.c b/ports/cxd56/supervisor/port.c
index 086c2d198ec3a..d69f357799a1a 100644
--- a/ports/cxd56/supervisor/port.c
+++ b/ports/cxd56/supervisor/port.c
@@ -98,12 +98,8 @@ void reset_to_bootloader(void) {
     }
 }
 
-supervisor_allocation _fixed_stack;
-
-supervisor_allocation* port_fixed_stack(void) {
-    _fixed_stack.ptr = port_stack_get_limit();
-    _fixed_stack.length = (port_stack_get_top() - port_stack_get_limit()) * sizeof(uint32_t);
-    return &_fixed_stack;
+bool port_has_fixed_stack(void) {
+    return true;
 }
 
 uint32_t *port_stack_get_limit(void) {
diff --git a/ports/esp32s2/supervisor/port.c b/ports/esp32s2/supervisor/port.c
index aff7dbda4dcdd..264bdee9749ab 100644
--- a/ports/esp32s2/supervisor/port.c
+++ b/ports/esp32s2/supervisor/port.c
@@ -193,12 +193,8 @@ uint32_t *port_stack_get_top(void) {
     return port_stack_get_limit() + ESP_TASK_MAIN_STACK / (sizeof(uint32_t) / sizeof(StackType_t));
 }
 
-supervisor_allocation _fixed_stack;
-
-supervisor_allocation* port_fixed_stack(void) {
-    _fixed_stack.ptr = port_stack_get_limit();
-    _fixed_stack.length = (port_stack_get_top() - port_stack_get_limit()) * sizeof(uint32_t);
-    return &_fixed_stack;
+bool port_has_fixed_stack(void) {
+    return true;
 }
 
 // Place the word to save just after our BSS section that gets blanked.
diff --git a/ports/litex/supervisor/port.c b/ports/litex/supervisor/port.c
index 02617b9af7deb..f5c362ea6e6de 100644
--- a/ports/litex/supervisor/port.c
+++ b/ports/litex/supervisor/port.c
@@ -98,8 +98,8 @@ void reset_cpu(void) {
     for(;;) {}
 }
 
-supervisor_allocation* port_fixed_stack(void) {
-    return NULL;
+bool port_has_fixed_stack(void) {
+    return false;
 }
 
 uint32_t *port_heap_get_bottom(void) {
diff --git a/ports/mimxrt10xx/supervisor/port.c b/ports/mimxrt10xx/supervisor/port.c
index e3fef373f8154..1be2b10396c92 100644
--- a/ports/mimxrt10xx/supervisor/port.c
+++ b/ports/mimxrt10xx/supervisor/port.c
@@ -334,11 +334,8 @@ uint32_t *port_stack_get_top(void) {
     return &_ld_stack_top;
 }
 
-supervisor_allocation _fixed_stack;
-supervisor_allocation* port_fixed_stack(void) {
-    _fixed_stack.ptr = port_stack_get_limit();
-    _fixed_stack.length = (port_stack_get_top() - port_stack_get_limit()) * sizeof(uint32_t);
-    return &_fixed_stack;
+bool port_has_fixed_stack(void) {
+    return true;
 }
 
 uint32_t *port_heap_get_bottom(void) {
diff --git a/ports/nrf/supervisor/port.c b/ports/nrf/supervisor/port.c
index 493de43e0f024..5f1c9f1ba9195 100644
--- a/ports/nrf/supervisor/port.c
+++ b/ports/nrf/supervisor/port.c
@@ -251,8 +251,8 @@ uint32_t *port_heap_get_top(void) {
     return port_stack_get_top();
 }
 
-supervisor_allocation* port_fixed_stack(void) {
-    return NULL;
+bool port_has_fixed_stack(void) {
+    return false;
 }
 
 uint32_t *port_stack_get_limit(void) {
diff --git a/ports/stm/supervisor/port.c b/ports/stm/supervisor/port.c
index a8aab00ff25e3..dba1cf21ee983 100644
--- a/ports/stm/supervisor/port.c
+++ b/ports/stm/supervisor/port.c
@@ -267,8 +267,8 @@ uint32_t *port_heap_get_top(void) {
     return &_ld_heap_end;
 }
 
-supervisor_allocation* port_fixed_stack(void) {
-    return NULL;
+bool port_has_fixed_stack(void) {
+    return false;
 }
 
 uint32_t *port_stack_get_limit(void) {
diff --git a/py/circuitpy_mpconfig.h b/py/circuitpy_mpconfig.h
index 28fd4095c4621..28fd6e9b00f17 100644
--- a/py/circuitpy_mpconfig.h
+++ b/py/circuitpy_mpconfig.h
@@ -858,6 +858,9 @@ extern const struct _mp_obj_module_t wifi_module;
 
 #include "supervisor/flash_root_pointers.h"
 
+// From supervisor/memory.c
+struct _supervisor_allocation_node;
+
 #define CIRCUITPY_COMMON_ROOT_POINTERS \
     const char *readline_hist[8]; \
     vstr_t *repl_line; \
@@ -869,6 +872,7 @@ extern const struct _mp_obj_module_t wifi_module;
     FLASH_ROOT_POINTERS \
     MEMORYMONITOR_ROOT_POINTERS \
     NETWORK_ROOT_POINTERS \
+    struct _supervisor_allocation_node* first_embedded_allocation; \
 
 void supervisor_run_background_tasks_if_tick(void);
 #define RUN_BACKGROUND_TASKS (supervisor_run_background_tasks_if_tick())
diff --git a/shared-module/rgbmatrix/RGBMatrix.c b/shared-module/rgbmatrix/RGBMatrix.c
index 94c3eda27f3ae..1f144aedb5f36 100644
--- a/shared-module/rgbmatrix/RGBMatrix.c
+++ b/shared-module/rgbmatrix/RGBMatrix.c
@@ -220,7 +220,7 @@ void *common_hal_rgbmatrix_allocator_impl(size_t sz) {
     if (gc_alloc_possible()) {
         return m_malloc_maybe(sz + sizeof(void*), true);
     } else {
-        supervisor_allocation *allocation = allocate_memory(align32_size(sz), false);
+        supervisor_allocation *allocation = allocate_memory(align32_size(sz), false, false);
         return allocation ? allocation->ptr : NULL;
     }
 }
diff --git a/shared-module/sharpdisplay/SharpMemoryFramebuffer.c b/shared-module/sharpdisplay/SharpMemoryFramebuffer.c
index b199e98d63efc..aefb6b18de490 100644
--- a/shared-module/sharpdisplay/SharpMemoryFramebuffer.c
+++ b/shared-module/sharpdisplay/SharpMemoryFramebuffer.c
@@ -40,7 +40,7 @@
 #define SHARPMEM_BIT_VCOM_LSB (0x40)
 
 static void *hybrid_alloc(size_t sz) {
-    supervisor_allocation *allocation = allocate_memory(align32_size(sz), false);
+    supervisor_allocation *allocation = allocate_memory(align32_size(sz), false, false);
     if (allocation) {
         memset(allocation->ptr, 0, sz);
         return allocation->ptr;
diff --git a/shared-module/usb_midi/__init__.c b/shared-module/usb_midi/__init__.c
index 73a314b9972a3..5afdd18213265 100644
--- a/shared-module/usb_midi/__init__.c
+++ b/shared-module/usb_midi/__init__.c
@@ -45,7 +45,7 @@ void usb_midi_init(void) {
     uint16_t portout_size = align32_size(sizeof(usb_midi_portout_obj_t));
 
     // For each embedded MIDI Jack in the descriptor we create a Port
-    usb_midi_allocation = allocate_memory(tuple_size + portin_size + portout_size, false);
+    usb_midi_allocation = allocate_memory(tuple_size + portin_size + portout_size, false, false);
 
     mp_obj_tuple_t *ports = (mp_obj_tuple_t *) usb_midi_allocation->ptr;
     ports->base.type = &mp_type_tuple;
diff --git a/supervisor/memory.h b/supervisor/memory.h
index f4359ca46ecd8..4307e3f21d556 100755
--- a/supervisor/memory.h
+++ b/supervisor/memory.h
@@ -33,23 +33,36 @@
 
 #include <stdbool.h>
 #include <stdint.h>
+#include <stddef.h>
 
 typedef struct {
     uint32_t* ptr;
-    uint32_t length; // in bytes
 } supervisor_allocation;
 
 
 
-void memory_init(void);
 void free_memory(supervisor_allocation* allocation);
+
+// Find the allocation with the given ptr, NULL if not found. When called from the context of a
+// supervisor_move_memory() callback, finds the allocation that had that ptr *before* the move, but
+// the returned allocation already contains the ptr after the move.
+// When called with NULL, may return either NULL or an unused allocation whose ptr is NULL (this is
+// a feature used internally in allocate_memory to save code size). Passing the return value to
+// free_memory() is a permissible no-op in either case.
 supervisor_allocation* allocation_from_ptr(void *ptr);
+
 supervisor_allocation* allocate_remaining_memory(void);
 
 // Allocate a piece of a given length in bytes. If high_address is true then it should be allocated
 // at a lower address from the top of the stack. Otherwise, addresses will increase starting after
-// statically allocated memory.
-supervisor_allocation* allocate_memory(uint32_t length, bool high_address);
+// statically allocated memory. If movable is false, memory will be taken from outside the GC heap
+// and will stay stationary until freed. While the VM is running, this will fail unless a previous
+// allocation of exactly matching length has recently been freed. If movable is true, memory will be
+// taken from either outside or inside the GC heap, and when the VM exits, will be moved outside.
+// The ptr of the returned supervisor_allocation will change at that point. If you need to be
+// notified of that, add your own callback function at the designated place near the end of
+// supervisor_move_memory().
+supervisor_allocation* allocate_memory(uint32_t length, bool high_address, bool movable);
 
 static inline uint16_t align32_size(uint16_t size) {
     if (size % 4 != 0) {
@@ -58,7 +71,10 @@ static inline uint16_t align32_size(uint16_t size) {
     return size;
 }
 
-// Called after the heap is freed in case the supervisor wants to save some values.
+size_t get_allocation_length(supervisor_allocation* allocation);
+
+// Called after the GC heap is freed, transfers movable allocations from the GC heap to the
+// supervisor heap and compacts the supervisor heap.
 void supervisor_move_memory(void);
 
 #endif  // MICROPY_INCLUDED_SUPERVISOR_MEMORY_H
diff --git a/supervisor/port.h b/supervisor/port.h
index f5b3c15d1412e..5bc06bc4e1413 100644
--- a/supervisor/port.h
+++ b/supervisor/port.h
@@ -61,7 +61,8 @@ uint32_t *port_stack_get_limit(void);
 // Get stack top address
 uint32_t *port_stack_get_top(void);
 
-supervisor_allocation* port_fixed_stack(void);
+// True if stack is not located inside heap (at the top)
+bool port_has_fixed_stack(void);
 
 // Get heap bottom address
 uint32_t *port_heap_get_bottom(void);
@@ -69,8 +70,6 @@ uint32_t *port_heap_get_bottom(void);
 // Get heap top address
 uint32_t *port_heap_get_top(void);
 
-supervisor_allocation* port_fixed_heap(void);
-
 // Save and retrieve a word from memory that is preserved over reset. Used for safe mode.
 void port_set_saved_word(uint32_t);
 uint32_t port_get_saved_word(void);
diff --git a/supervisor/shared/display.c b/supervisor/shared/display.c
index a9ae25884239f..de45e2672f13f 100644
--- a/supervisor/shared/display.c
+++ b/supervisor/shared/display.c
@@ -82,7 +82,7 @@ void supervisor_start_terminal(uint16_t width_px, uint16_t height_px) {
     uint16_t total_tiles = width_in_tiles * height_in_tiles;
 
     // First try to allocate outside the heap. This will fail when the VM is running.
-    tilegrid_tiles = allocate_memory(align32_size(total_tiles), false);
+    tilegrid_tiles = allocate_memory(align32_size(total_tiles), false, false);
     uint8_t* tiles;
     if (tilegrid_tiles == NULL) {
         tiles = m_malloc(total_tiles, true);
@@ -133,7 +133,7 @@ void supervisor_display_move_memory(void) {
         grid->tiles == MP_STATE_VM(terminal_tilegrid_tiles)) {
         uint16_t total_tiles = grid->width_in_tiles * grid->height_in_tiles;
 
-        tilegrid_tiles = allocate_memory(align32_size(total_tiles), false);
+        tilegrid_tiles = allocate_memory(align32_size(total_tiles), false, false);
         if (tilegrid_tiles != NULL) {
             memcpy(tilegrid_tiles->ptr, grid->tiles, total_tiles);
             grid->tiles = (uint8_t*) tilegrid_tiles->ptr;
diff --git a/supervisor/shared/external_flash/external_flash.c b/supervisor/shared/external_flash/external_flash.c
index 5bde7fd4855f5..e2d767235ee73 100644
--- a/supervisor/shared/external_flash/external_flash.c
+++ b/supervisor/shared/external_flash/external_flash.c
@@ -338,7 +338,7 @@ static bool allocate_ram_cache(void) {
 
     uint32_t table_size = blocks_per_sector * pages_per_block * sizeof(uint32_t);
     // Attempt to allocate outside the heap first.
-    supervisor_cache = allocate_memory(table_size + SPI_FLASH_ERASE_SIZE, false);
+    supervisor_cache = allocate_memory(table_size + SPI_FLASH_ERASE_SIZE, false, false);
     if (supervisor_cache != NULL) {
         MP_STATE_VM(flash_ram_cache) = (uint8_t **) supervisor_cache->ptr;
         uint8_t* page_start = (uint8_t *) supervisor_cache->ptr + table_size;
diff --git a/supervisor/shared/memory.c b/supervisor/shared/memory.c
index 0f96ae273409d..2be3b42d63f94 100755
--- a/supervisor/shared/memory.c
+++ b/supervisor/shared/memory.c
@@ -27,78 +27,81 @@
 #include "supervisor/memory.h"
 #include "supervisor/port.h"
 
-#include <stddef.h>
+#include <string.h>
 
+#include "py/gc.h"
 #include "supervisor/shared/display.h"
 
 #define CIRCUITPY_SUPERVISOR_ALLOC_COUNT (12)
 
-// Using a zero length to mark an unused allocation makes the code a bit shorter (but makes it
-// impossible to support zero-length allocations).
-#define FREE 0
-
 // The lowest two bits of a valid length are always zero, so we can use them to mark an allocation
-// as freed by the client but not yet reclaimed into the FREE middle.
+// as a hole (freed by the client but not yet reclaimed into the free middle) and as movable.
+#define FLAGS 3
 #define HOLE 1
+#define MOVABLE 2
 
 static supervisor_allocation allocations[CIRCUITPY_SUPERVISOR_ALLOC_COUNT];
-// We use uint32_t* to ensure word (4 byte) alignment.
-uint32_t* low_address;
-uint32_t* high_address;
+supervisor_allocation* old_allocations;
 
-void memory_init(void) {
-    low_address = port_heap_get_bottom();
-    high_address = port_heap_get_top();
-}
+typedef struct _supervisor_allocation_node {
+    struct _supervisor_allocation_node* next;
+    size_t length;
+    // We use uint32_t to ensure word (4 byte) alignment.
+    uint32_t data[];
+} supervisor_allocation_node;
+
+supervisor_allocation_node* low_head;
+supervisor_allocation_node* high_head;
+
+// Intermediate (void*) is to suppress -Wcast-align warning. Alignment will always be correct
+// because this only reverses how (alloc)->ptr was obtained as &(node->data[0]).
+#define ALLOCATION_NODE(alloc) ((supervisor_allocation_node*)(void*)((char*)((alloc)->ptr) - sizeof(supervisor_allocation_node)))
 
 void free_memory(supervisor_allocation* allocation) {
-    if (allocation == NULL) {
+    if (allocation == NULL || allocation->ptr == NULL) {
         return;
     }
-    int32_t index = 0;
-    bool found = false;
-    for (index = 0; index < CIRCUITPY_SUPERVISOR_ALLOC_COUNT; index++) {
-        found = allocation == &allocations[index];
-        if (found) {
-            break;
-        }
+    supervisor_allocation_node* node = ALLOCATION_NODE(allocation);
+    if (node == low_head) {
+        do {
+            low_head = low_head->next;
+        } while (low_head != NULL && (low_head->length & HOLE));
     }
-    if (!found) {
-        // Bad!
-        // TODO(tannewt): Add a way to escape into safe mode on error.
+    else if (node == high_head) {
+        do {
+            high_head = high_head->next;
+        } while (high_head != NULL && (high_head->length & HOLE));
     }
-    if (allocation->ptr == high_address) {
-        high_address += allocation->length / 4;
-        allocation->length = FREE;
-        for (index++; index < CIRCUITPY_SUPERVISOR_ALLOC_COUNT; index++) {
-            if (!(allocations[index].length & HOLE)) {
-                break;
-            }
-            // Division automatically shifts out the HOLE bit.
-            high_address += allocations[index].length / 4;
-            allocations[index].length = FREE;
-        }
-    } else if (allocation->ptr + allocation->length / 4 == low_address) {
-        low_address = allocation->ptr;
-        allocation->length = FREE;
-        for (index--; index >= 0; index--) {
-            if (!(allocations[index].length & HOLE)) {
-                break;
+    else {
+        // Check if it's in the list of embedded allocations.
+        supervisor_allocation_node** emb = &MP_STATE_VM(first_embedded_allocation);
+        while (*emb != NULL) {
+            if (*emb == node) {
+                // Found, remove it from the list.
+                *emb = node->next;
+                m_free(node
+#if MICROPY_MALLOC_USES_ALLOCATED_SIZE
+                    , sizeof(supervisor_allocation_node) + (node->length & ~FLAGS)
+#endif
+                );
+                goto done;
             }
-            low_address -= allocations[index].length / 4;
-            allocations[index].length = FREE;
+            emb = &((*emb)->next);
         }
-    } else {
-        // Freed memory isn't in the middle so skip updating bounds. The memory will be added to the
-        // middle when the memory to the inside is freed. We still need its length, but setting
-        // only the lowest bit is nondestructive.
-        allocation->length |= HOLE;
+        // Else it must be within the low or high ranges and becomes a hole.
+        node->length = ((node->length & ~FLAGS) | HOLE);
     }
+done:
+    allocation->ptr = NULL;
 }
 
 supervisor_allocation* allocation_from_ptr(void *ptr) {
+    // When called from the context of supervisor_move_memory() (old_allocations != NULL), search
+    // by old pointer to give clients a way of mapping from old to new pointer. But not if
+    // ptr == NULL, then the caller wants an allocation whose current ptr is NULL.
+    supervisor_allocation* list = (old_allocations && ptr) ? old_allocations : &allocations[0];
     for (size_t index = 0; index < CIRCUITPY_SUPERVISOR_ALLOC_COUNT; index++) {
-        if (allocations[index].ptr == ptr) {
+        if (list[index].ptr == ptr) {
             return &allocations[index];
         }
     }
@@ -106,50 +109,172 @@ supervisor_allocation* allocation_from_ptr(void *ptr) {
 }
 
 supervisor_allocation* allocate_remaining_memory(void) {
-    if (low_address == high_address) {
-        return NULL;
+    uint32_t* low_address = low_head ? low_head->data + low_head->length / 4 : port_heap_get_bottom();
+    uint32_t* high_address = high_head ? (uint32_t*)high_head : port_heap_get_top();
+    return allocate_memory((high_address - low_address) * 4 - sizeof(supervisor_allocation_node), false, false);
+}
+
+static supervisor_allocation_node* find_hole(supervisor_allocation_node* node, size_t length) {
+    for (; node != NULL; node = node->next) {
+        if (node->length == (length | HOLE)) {
+            break;
+        }
     }
-    return allocate_memory((high_address - low_address) * 4, false);
+    return node;
 }
 
-supervisor_allocation* allocate_memory(uint32_t length, bool high) {
+static supervisor_allocation_node* allocate_memory_node(uint32_t length, bool high, bool movable) {
+    // supervisor_move_memory() currently does not support movable allocations on the high side, it
+    // must be extended first if this is ever needed.
+    assert(!(high && movable));
     if (length == 0 || length % 4 != 0) {
         return NULL;
     }
-    uint8_t index = 0;
-    int8_t direction = 1;
-    if (high) {
-        index = CIRCUITPY_SUPERVISOR_ALLOC_COUNT - 1;
-        direction = -1;
-    }
-    supervisor_allocation* alloc;
-    for (; index < CIRCUITPY_SUPERVISOR_ALLOC_COUNT; index += direction) {
-        alloc = &allocations[index];
-        if (alloc->length == FREE && (high_address - low_address) * 4 >= (int32_t) length) {
-            break;
+    // 1. Matching hole on the requested side?
+    supervisor_allocation_node* node = find_hole(high ? high_head : low_head, length);
+    if (!node) {
+        // 2. Enough free space in the middle?
+        uint32_t* low_address = low_head ? low_head->data + low_head->length / 4 : port_heap_get_bottom();
+        uint32_t* high_address = high_head ? (uint32_t*)high_head : port_heap_get_top();
+        if ((high_address - low_address) * 4 >= (int32_t)(sizeof(supervisor_allocation_node) + length)) {
+            if (high) {
+                high_address -= (sizeof(supervisor_allocation_node) + length) / 4;
+                node = (supervisor_allocation_node*)high_address;
+                node->next = high_head;
+                high_head = node;
+            }
+            else {
+                node = (supervisor_allocation_node*)low_address;
+                node->next = low_head;
+                low_head = node;
+            }
         }
-        // If a hole matches in length exactly, we can reuse it.
-        if (alloc->length == (length | HOLE)) {
-            alloc->length = length;
-            return alloc;
+        else {
+            // 3. Matching hole on the other side?
+            node = find_hole(high ? low_head : high_head, length);
+            if (!node) {
+                // 4. GC allocation?
+                if (movable && gc_alloc_possible()) {
+                    node = m_malloc_maybe(sizeof(supervisor_allocation_node) + length, true);
+                    if (node) {
+                        node->next = MP_STATE_VM(first_embedded_allocation);
+                        MP_STATE_VM(first_embedded_allocation) = node;
+                    }
+                }
+                if (!node) {
+                    // 5. Give up.
+                    return NULL;
+                }
+            }
         }
     }
-    if (index >= CIRCUITPY_SUPERVISOR_ALLOC_COUNT) {
+    node->length = length;
+    if (movable) {
+        node->length |= MOVABLE;
+    }
+    return node;
+}
+
+supervisor_allocation* allocate_memory(uint32_t length, bool high, bool movable) {
+    supervisor_allocation_node* node = allocate_memory_node(length, high, movable);
+    if (!node) {
         return NULL;
     }
-    if (high) {
-        high_address -= length / 4;
-        alloc->ptr = high_address;
-    } else {
-        alloc->ptr = low_address;
-        low_address += length / 4;
+    // Find the first free allocation.
+    supervisor_allocation* alloc = allocation_from_ptr(NULL);
+    if (!alloc) {
+        // We should free node again to avoid leaking, but something is wrong anyway if clients try
+        // to make more allocations than available, so don't bother.
+        return NULL;
     }
-    alloc->length = length;
+    alloc->ptr = &(node->data[0]);
     return alloc;
 }
 
+size_t get_allocation_length(supervisor_allocation* allocation) {
+    return ALLOCATION_NODE(allocation)->length & ~FLAGS;
+}
+
 void supervisor_move_memory(void) {
+    // This must be called exactly after freeing the heap, so that the embedded allocations, if any,
+    // are now in the free region.
+    assert(MP_STATE_VM(first_embedded_allocation) == NULL || (low_head < MP_STATE_VM(first_embedded_allocation) && MP_STATE_VM(first_embedded_allocation) < high_head));
+
+    // Save the old pointers for allocation_from_ptr().
+    supervisor_allocation old_allocations_array[CIRCUITPY_SUPERVISOR_ALLOC_COUNT];
+    memcpy(old_allocations_array, allocations, sizeof(allocations));
+
+    // Compact the low side. Traverse the list repeatedly, finding movable allocations preceded by a
+    // hole and swapping them, until no more are found. This is not the most runtime-efficient way,
+    // but probably the shortest and simplest code.
+    bool acted;
+    do {
+        acted = false;
+        supervisor_allocation_node** nodep = &low_head;
+        while (*nodep != NULL && (*nodep)->next != NULL) {
+            if (((*nodep)->length & MOVABLE) && ((*nodep)->next->length & HOLE)) {
+                supervisor_allocation_node* oldnode = *nodep;
+                supervisor_allocation_node* start = oldnode->next;
+                supervisor_allocation* alloc = allocation_from_ptr(&(oldnode->data[0]));
+                assert(alloc != NULL);
+                alloc->ptr = &(start->data[0]);
+                oldnode->next = start->next;
+                size_t holelength = start->length;
+                size_t size = sizeof(supervisor_allocation_node) + (oldnode->length & ~FLAGS);
+                memmove(start, oldnode, size);
+                supervisor_allocation_node* newhole = (supervisor_allocation_node*)(void*)((char*)start + size);
+                newhole->next = start;
+                newhole->length = holelength;
+                *nodep = newhole;
+                acted = true;
+            }
+            nodep = &((*nodep)->next);
+        }
+    } while (acted);
+    // Any holes bubbled to the top can be absorbed into the free middle.
+    while (low_head != NULL && (low_head->length & HOLE)) {
+        low_head = low_head->next;
+    };
+
+    // Don't bother compacting the high side, there are no movable allocations and no holes there in
+    // current usage.
+
+    // Promote the embedded allocations to top-level ones, compacting them at the beginning of the
+    // now free region (or possibly in matching holes).
+    // The linked list is unordered, but allocations must be processed in order to avoid risking
+    // overwriting each other. To that end, repeatedly find the lowest element of the list, remove
+    // it from the list, and process it. This ad-hoc selection sort results in substantially shorter
+    // code than using the qsort() function from the C library.
+    while (MP_STATE_VM(first_embedded_allocation)) {
+        // First element is first candidate.
+        supervisor_allocation_node** pminnode = &MP_STATE_VM(first_embedded_allocation);
+        // Iterate from second element (if any) on.
+        for (supervisor_allocation_node** pnode = &(MP_STATE_VM(first_embedded_allocation)->next); *pnode != NULL; pnode = &(*pnode)->next) {
+            if (*pnode < *pminnode) {
+                pminnode = pnode;
+            }
+        }
+        // Remove from list.
+        supervisor_allocation_node* node = *pminnode;
+        *pminnode = node->next;
+        // Process.
+        size_t length = (node->length & ~FLAGS);
+        supervisor_allocation* alloc = allocation_from_ptr(&(node->data[0]));
+        assert(alloc != NULL);
+        // This may overwrite the header of node if it happened to be there already, but not the
+        // data.
+        supervisor_allocation_node* new_node = allocate_memory_node(length, false, true);
+        // There must be enough free space.
+        assert(new_node != NULL);
+        memmove(&(new_node->data[0]), &(node->data[0]), length);
+        alloc->ptr = &(new_node->data[0]);
+    }
+
+    // Notify clients that their movable allocations may have moved.
+    old_allocations = &old_allocations_array[0];
     #if CIRCUITPY_DISPLAYIO
     supervisor_display_move_memory();
     #endif
+    // Add calls to further clients here.
+    old_allocations = NULL;
 }
diff --git a/supervisor/shared/stack.c b/supervisor/shared/stack.c
index e7aa956b01612..afea204010d7e 100755
--- a/supervisor/shared/stack.c
+++ b/supervisor/shared/stack.c
@@ -34,36 +34,42 @@
 
 extern uint32_t _estack;
 
+// Requested size.
 static uint32_t next_stack_size = CIRCUITPY_DEFAULT_STACK_SIZE;
 static uint32_t current_stack_size = 0;
-supervisor_allocation* stack_alloc = NULL;
+// Actual location and size, may be larger than requested.
+static uint32_t* stack_limit = NULL;
+static size_t stack_length = 0;
 
 #define EXCEPTION_STACK_SIZE 1024
 
 void allocate_stack(void) {
 
-    if (port_fixed_stack() != NULL) {
-        stack_alloc = port_fixed_stack();
-        current_stack_size = stack_alloc->length;
+    if (port_has_fixed_stack()) {
+        stack_limit = port_stack_get_limit();
+        stack_length = (port_stack_get_top() - stack_limit)*sizeof(uint32_t);
+        current_stack_size = stack_length;
     } else {
         mp_uint_t regs[10];
         mp_uint_t sp = cpu_get_regs_and_sp(regs);
 
         mp_uint_t c_size = (uint32_t) port_stack_get_top() - sp;
-        stack_alloc = allocate_memory(c_size + next_stack_size + EXCEPTION_STACK_SIZE, true);
+        supervisor_allocation* stack_alloc = allocate_memory(c_size + next_stack_size + EXCEPTION_STACK_SIZE, true, false);
         if (stack_alloc == NULL) {
-            stack_alloc = allocate_memory(c_size + CIRCUITPY_DEFAULT_STACK_SIZE + EXCEPTION_STACK_SIZE, true);
+            stack_alloc = allocate_memory(c_size + CIRCUITPY_DEFAULT_STACK_SIZE + EXCEPTION_STACK_SIZE, true, false);
             current_stack_size = CIRCUITPY_DEFAULT_STACK_SIZE;
         } else {
             current_stack_size = next_stack_size;
         }
+        stack_limit = stack_alloc->ptr;
+        stack_length = get_allocation_length(stack_alloc);
     }
 
-    *stack_alloc->ptr = STACK_CANARY_VALUE;
+    *stack_limit = STACK_CANARY_VALUE;
 }
 
 inline bool stack_ok(void) {
-    return stack_alloc == NULL || *stack_alloc->ptr == STACK_CANARY_VALUE;
+    return stack_limit == NULL || *stack_limit == STACK_CANARY_VALUE;
 }
 
 inline void assert_heap_ok(void) {
@@ -77,18 +83,26 @@ void stack_init(void) {
 }
 
 void stack_resize(void) {
-    if (stack_alloc == NULL) {
+    if (stack_limit == NULL) {
         return;
     }
     if (next_stack_size == current_stack_size) {
-        *stack_alloc->ptr = STACK_CANARY_VALUE;
+        *stack_limit = STACK_CANARY_VALUE;
         return;
     }
-    free_memory(stack_alloc);
-    stack_alloc = NULL;
+    free_memory(allocation_from_ptr(stack_limit));
+    stack_limit = NULL;
     allocate_stack();
 }
 
+uint32_t* stack_get_bottom(void) {
+    return stack_limit;
+}
+
+size_t stack_get_length(void) {
+    return stack_length;
+}
+
 void set_next_stack_size(uint32_t size) {
     next_stack_size = size;
 }
diff --git a/supervisor/shared/stack.h b/supervisor/shared/stack.h
index 7096f0b3edaf4..1c75de5f78ba0 100755
--- a/supervisor/shared/stack.h
+++ b/supervisor/shared/stack.h
@@ -31,10 +31,12 @@
 
 #include "supervisor/memory.h"
 
-extern supervisor_allocation* stack_alloc;
-
 void stack_init(void);
 void stack_resize(void);
+// Actual stack location and size, may be larger than requested.
+uint32_t* stack_get_bottom(void);
+size_t stack_get_length(void);
+// Next/current requested stack size.
 void set_next_stack_size(uint32_t size);
 uint32_t get_current_stack_size(void);
 bool stack_ok(void);

From 2ba9805f845e000fa3299e95e93a063471751ecd Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Sun, 11 Oct 2020 20:39:19 +0200
Subject: [PATCH 2/9] Use movable allocation system for terminal tilegrid.

Moving memory is now done by the infrastructure, so doing it here is no longer necessary, nor would it be correct.
---
 py/circuitpy_mpconfig.h     |  1 -
 supervisor/shared/display.c | 43 +++++++++++++++----------------------
 2 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/py/circuitpy_mpconfig.h b/py/circuitpy_mpconfig.h
index 28fd6e9b00f17..34ea9b022fa2d 100644
--- a/py/circuitpy_mpconfig.h
+++ b/py/circuitpy_mpconfig.h
@@ -867,7 +867,6 @@ struct _supervisor_allocation_node;
     mp_obj_t rtc_time_source; \
     GAMEPAD_ROOT_POINTERS \
     mp_obj_t pew_singleton; \
-    mp_obj_t terminal_tilegrid_tiles; \
     BOARD_UART_ROOT_POINTER \
     FLASH_ROOT_POINTERS \
     MEMORYMONITOR_ROOT_POINTERS \
diff --git a/supervisor/shared/display.c b/supervisor/shared/display.c
index de45e2672f13f..9c9c66cd7fb69 100644
--- a/supervisor/shared/display.c
+++ b/supervisor/shared/display.c
@@ -81,19 +81,21 @@ void supervisor_start_terminal(uint16_t width_px, uint16_t height_px) {
 
     uint16_t total_tiles = width_in_tiles * height_in_tiles;
 
-    // First try to allocate outside the heap. This will fail when the VM is running.
-    tilegrid_tiles = allocate_memory(align32_size(total_tiles), false, false);
-    uint8_t* tiles;
-    if (tilegrid_tiles == NULL) {
-        tiles = m_malloc(total_tiles, true);
-        MP_STATE_VM(terminal_tilegrid_tiles) = tiles;
-    } else {
-        tiles = (uint8_t*) tilegrid_tiles->ptr;
+    // Reuse the previous allocation if possible
+    if (tilegrid_tiles) {
+        if (get_allocation_length(tilegrid_tiles) != align32_size(total_tiles)) {
+            free_memory(tilegrid_tiles);
+            tilegrid_tiles = NULL;
+        }
     }
-
-    if (tiles == NULL) {
-        return;
+    if (!tilegrid_tiles) {
+        tilegrid_tiles = allocate_memory(align32_size(total_tiles), false, true);
+        if (!tilegrid_tiles) {
+            return;
+        }
     }
+    uint8_t* tiles = (uint8_t*) tilegrid_tiles->ptr;
+
     grid->y = tall ? blinka_bitmap.height : 0;
     grid->x = tall ? 0 : blinka_bitmap.width;
     grid->top_left_y = 0;
@@ -120,7 +122,6 @@ void supervisor_stop_terminal(void) {
     if (tilegrid_tiles != NULL) {
         free_memory(tilegrid_tiles);
         tilegrid_tiles = NULL;
-        supervisor_terminal_text_grid.inline_tiles = false;
         supervisor_terminal_text_grid.tiles = NULL;
     }
     #endif
@@ -128,20 +129,10 @@ void supervisor_stop_terminal(void) {
 
 void supervisor_display_move_memory(void) {
     #if CIRCUITPY_TERMINALIO
-    displayio_tilegrid_t* grid = &supervisor_terminal_text_grid;
-    if (MP_STATE_VM(terminal_tilegrid_tiles) != NULL &&
-        grid->tiles == MP_STATE_VM(terminal_tilegrid_tiles)) {
-        uint16_t total_tiles = grid->width_in_tiles * grid->height_in_tiles;
-
-        tilegrid_tiles = allocate_memory(align32_size(total_tiles), false, false);
-        if (tilegrid_tiles != NULL) {
-            memcpy(tilegrid_tiles->ptr, grid->tiles, total_tiles);
-            grid->tiles = (uint8_t*) tilegrid_tiles->ptr;
-        } else {
-            grid->tiles = NULL;
-            grid->inline_tiles = false;
-        }
-        MP_STATE_VM(terminal_tilegrid_tiles) = NULL;
+    if (tilegrid_tiles != NULL) {
+        supervisor_terminal_text_grid.tiles = (uint8_t*) tilegrid_tiles->ptr;
+    } else {
+        supervisor_terminal_text_grid.tiles = NULL;
     }
     #endif
 

From ac91220361e402bbf6ebb741a9b6ea258a05da99 Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Fri, 16 Oct 2020 23:08:29 +0200
Subject: [PATCH 3/9] Use movable allocation system for Sharp display
 framebuffer.

Hybrid allocation is now part of the infrastructure. Moving the memory contents would not be necessary because displayio can recreate them, but it does not hurt.
---
 .../sharpdisplay/SharpMemoryFramebuffer.c     | 57 +++++--------------
 .../sharpdisplay/SharpMemoryFramebuffer.h     |  1 -
 2 files changed, 14 insertions(+), 44 deletions(-)

diff --git a/shared-module/sharpdisplay/SharpMemoryFramebuffer.c b/shared-module/sharpdisplay/SharpMemoryFramebuffer.c
index aefb6b18de490..4b92bd637ac96 100644
--- a/shared-module/sharpdisplay/SharpMemoryFramebuffer.c
+++ b/shared-module/sharpdisplay/SharpMemoryFramebuffer.c
@@ -34,32 +34,10 @@
 #include "shared-module/sharpdisplay/SharpMemoryFramebuffer.h"
 
 #include "supervisor/memory.h"
-#include "supervisor/shared/safe_mode.h"
 
 #define SHARPMEM_BIT_WRITECMD_LSB (0x80)
 #define SHARPMEM_BIT_VCOM_LSB (0x40)
 
-static void *hybrid_alloc(size_t sz) {
-    supervisor_allocation *allocation = allocate_memory(align32_size(sz), false, false);
-    if (allocation) {
-        memset(allocation->ptr, 0, sz);
-        return allocation->ptr;
-    }
-    if (gc_alloc_possible()) {
-        return m_malloc(sz, true);
-    }
-    reset_into_safe_mode(MEM_MANAGE);
-    return NULL; // unreached
-}
-
-static inline void hybrid_free(void *ptr_in) {
-    supervisor_allocation *allocation = allocation_from_ptr(ptr_in);
-
-    if (allocation) {
-        free_memory(allocation);
-    }
-}
-
 STATIC uint8_t bitrev(uint8_t n) {
     uint8_t r = 0;
     for(int i=0;i<8;i++) r |= ((n>>i) & 1)<<(7-i);
@@ -102,9 +80,9 @@ void common_hal_sharpdisplay_framebuffer_reset(sharpdisplay_framebuffer_obj_t *s
 }
 
 void common_hal_sharpdisplay_framebuffer_reconstruct(sharpdisplay_framebuffer_obj_t *self) {
-    if (!allocation_from_ptr(self->bufinfo.buf)) {
-        self->bufinfo.buf = NULL;
-    }
+    // Look up the allocation by the old pointer and get the new pointer from it.
+    supervisor_allocation* alloc = allocation_from_ptr(self->bufinfo.buf);
+    self->bufinfo.buf = alloc ? alloc->ptr : NULL;
 }
 
 void common_hal_sharpdisplay_framebuffer_get_bufinfo(sharpdisplay_framebuffer_obj_t *self, mp_buffer_info_t *bufinfo) {
@@ -112,7 +90,12 @@ void common_hal_sharpdisplay_framebuffer_get_bufinfo(sharpdisplay_framebuffer_ob
         int row_stride = common_hal_sharpdisplay_framebuffer_get_row_stride(self);
         int height = common_hal_sharpdisplay_framebuffer_get_height(self);
         self->bufinfo.len = row_stride * height + 2;
-        self->bufinfo.buf = hybrid_alloc(self->bufinfo.len);
+        supervisor_allocation* alloc = allocate_memory(align32_size(self->bufinfo.len), false, true);
+        if (alloc == NULL) {
+            m_malloc_fail(self->bufinfo.len);
+        }
+        self->bufinfo.buf = alloc->ptr;
+        memset(alloc->ptr, 0, self->bufinfo.len);
 
         uint8_t *data = self->bufinfo.buf;
         *data++ = SHARPMEM_BIT_WRITECMD_LSB;
@@ -123,7 +106,9 @@ void common_hal_sharpdisplay_framebuffer_get_bufinfo(sharpdisplay_framebuffer_ob
         }
         self->full_refresh = true;
     }
-    *bufinfo = self->bufinfo;
+    if (bufinfo) {
+        *bufinfo = self->bufinfo;
+    }
 }
 
 void common_hal_sharpdisplay_framebuffer_deinit(sharpdisplay_framebuffer_obj_t *self) {
@@ -137,7 +122,7 @@ void common_hal_sharpdisplay_framebuffer_deinit(sharpdisplay_framebuffer_obj_t *
 
     common_hal_reset_pin(self->chip_select.pin);
 
-    hybrid_free(self->bufinfo.buf);
+    free_memory(allocation_from_ptr(self->bufinfo.buf));
 
     memset(self, 0, sizeof(*self));
 }
@@ -154,19 +139,7 @@ void common_hal_sharpdisplay_framebuffer_construct(sharpdisplay_framebuffer_obj_
     self->height = height;
     self->baudrate = baudrate;
 
-    int row_stride = common_hal_sharpdisplay_framebuffer_get_row_stride(self);
-    self->bufinfo.len = row_stride * height + 2;
-    // re-use a supervisor allocation if possible
-    self->bufinfo.buf = hybrid_alloc(self->bufinfo.len);
-
-    uint8_t *data = self->bufinfo.buf;
-    *data++ = SHARPMEM_BIT_WRITECMD_LSB;
-
-    for(int y=0; y<self->height; y++) {
-        *data = bitrev(y+1);
-        data += row_stride;
-    }
-    self->full_refresh = true;
+    common_hal_sharpdisplay_framebuffer_get_bufinfo(self, NULL);
 }
 
 void common_hal_sharpdisplay_framebuffer_swapbuffers(sharpdisplay_framebuffer_obj_t *self, uint8_t *dirty_row_bitmask) {
@@ -271,7 +244,5 @@ const framebuffer_p_t sharpdisplay_framebuffer_proto = {
 };
 
 void common_hal_sharpdisplay_framebuffer_collect_ptrs(sharpdisplay_framebuffer_obj_t *self) {
-    gc_collect_ptr(self->framebuffer);
     gc_collect_ptr(self->bus);
-    gc_collect_ptr(self->bufinfo.buf);
 }
diff --git a/shared-module/sharpdisplay/SharpMemoryFramebuffer.h b/shared-module/sharpdisplay/SharpMemoryFramebuffer.h
index 8acacc94e1122..08966a89c1f35 100644
--- a/shared-module/sharpdisplay/SharpMemoryFramebuffer.h
+++ b/shared-module/sharpdisplay/SharpMemoryFramebuffer.h
@@ -33,7 +33,6 @@
 
 typedef struct {
     mp_obj_base_t base;
-    mp_obj_t framebuffer;
     busio_spi_obj_t* bus;
     busio_spi_obj_t inline_bus;
     digitalio_digitalinout_obj_t chip_select;

From a4b84cf0e118f06cc3b563866d5c594b94b9b3fb Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Sat, 17 Oct 2020 13:49:10 +0200
Subject: [PATCH 4/9] Use movable allocation system for RGBMatrix allocations.

Hybrid allocation is now part of the infrastructure. Moving the memory contents would not be necessary because displayio can recreate them, but it does not hurt.
---
 shared-module/rgbmatrix/RGBMatrix.c | 25 +++++++------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/shared-module/rgbmatrix/RGBMatrix.c b/shared-module/rgbmatrix/RGBMatrix.c
index 1f144aedb5f36..a09767b62267e 100644
--- a/shared-module/rgbmatrix/RGBMatrix.c
+++ b/shared-module/rgbmatrix/RGBMatrix.c
@@ -78,10 +78,10 @@ void common_hal_rgbmatrix_rgbmatrix_reconstruct(rgbmatrix_rgbmatrix_obj_t* self,
         // verify that the matrix is big enough
         mp_get_index(mp_obj_get_type(self->framebuffer), self->bufinfo.len, MP_OBJ_NEW_SMALL_INT(self->bufsize-1), false);
     } else {
-        _PM_free(self->bufinfo.buf);
-        _PM_free(self->protomatter.rgbPins);
-        _PM_free(self->protomatter.addr);
-        _PM_free(self->protomatter.screenData);
+        common_hal_rgbmatrix_free_impl(self->bufinfo.buf);
+        common_hal_rgbmatrix_free_impl(self->protomatter.rgbPins);
+        common_hal_rgbmatrix_free_impl(self->protomatter.addr);
+        common_hal_rgbmatrix_free_impl(self->protomatter.screenData);
 
         self->framebuffer = NULL;
         self->bufinfo.buf = common_hal_rgbmatrix_allocator_impl(self->bufsize);
@@ -180,9 +180,6 @@ void common_hal_rgbmatrix_rgbmatrix_deinit(rgbmatrix_rgbmatrix_obj_t* self) {
 
 void rgbmatrix_rgbmatrix_collect_ptrs(rgbmatrix_rgbmatrix_obj_t* self) {
     gc_collect_ptr(self->framebuffer);
-    gc_collect_ptr(self->protomatter.rgbPins);
-    gc_collect_ptr(self->protomatter.addr);
-    gc_collect_ptr(self->protomatter.screenData);
 }
 
 void common_hal_rgbmatrix_rgbmatrix_set_paused(rgbmatrix_rgbmatrix_obj_t* self, bool paused) {
@@ -217,18 +214,10 @@ int common_hal_rgbmatrix_rgbmatrix_get_height(rgbmatrix_rgbmatrix_obj_t* self) {
 }
 
 void *common_hal_rgbmatrix_allocator_impl(size_t sz) {
-    if (gc_alloc_possible()) {
-        return m_malloc_maybe(sz + sizeof(void*), true);
-    } else {
-        supervisor_allocation *allocation = allocate_memory(align32_size(sz), false, false);
-        return allocation ? allocation->ptr : NULL;
-    }
+    supervisor_allocation *allocation = allocate_memory(align32_size(sz), false, true);
+    return allocation ? allocation->ptr : NULL;
 }
 
 void common_hal_rgbmatrix_free_impl(void *ptr_in) {
-    supervisor_allocation *allocation = allocation_from_ptr(ptr_in);
-
-    if (allocation) {
-        free_memory(allocation);
-    }
+    free_memory(allocation_from_ptr(ptr_in));
 }

From 7ca36d45a4ffe40c70814d62cb970007b7f40fee Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Fri, 23 Oct 2020 22:35:56 +0200
Subject: [PATCH 5/9] Fix align32_size().

The old uint16_t-based implementation not only caused crashes with requests larger than 64K (which can happen with RGBMatrix), but also generated much longer code than necessary.
---
 shared-module/usb_midi/__init__.c | 6 +++---
 supervisor/memory.h               | 7 ++-----
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/shared-module/usb_midi/__init__.c b/shared-module/usb_midi/__init__.c
index 5afdd18213265..3fb3f836cd763 100644
--- a/shared-module/usb_midi/__init__.c
+++ b/shared-module/usb_midi/__init__.c
@@ -40,9 +40,9 @@ supervisor_allocation* usb_midi_allocation;
 
 void usb_midi_init(void) {
     // TODO(tannewt): Make this dynamic.
-    uint16_t tuple_size = align32_size(sizeof(mp_obj_tuple_t) + sizeof(mp_obj_t*) * 2);
-    uint16_t portin_size = align32_size(sizeof(usb_midi_portin_obj_t));
-    uint16_t portout_size = align32_size(sizeof(usb_midi_portout_obj_t));
+    size_t tuple_size = align32_size(sizeof(mp_obj_tuple_t) + sizeof(mp_obj_t*) * 2);
+    size_t portin_size = align32_size(sizeof(usb_midi_portin_obj_t));
+    size_t portout_size = align32_size(sizeof(usb_midi_portout_obj_t));
 
     // For each embedded MIDI Jack in the descriptor we create a Port
     usb_midi_allocation = allocate_memory(tuple_size + portin_size + portout_size, false, false);
diff --git a/supervisor/memory.h b/supervisor/memory.h
index 4307e3f21d556..0f820eac1c475 100755
--- a/supervisor/memory.h
+++ b/supervisor/memory.h
@@ -64,11 +64,8 @@ supervisor_allocation* allocate_remaining_memory(void);
 // supervisor_move_memory().
 supervisor_allocation* allocate_memory(uint32_t length, bool high_address, bool movable);
 
-static inline uint16_t align32_size(uint16_t size) {
-    if (size % 4 != 0) {
-        return (size & 0xfffc) + 0x4;
-    }
-    return size;
+static inline size_t align32_size(size_t size) {
+    return (size + 3) & ~3;
 }
 
 size_t get_allocation_length(supervisor_allocation* allocation);

From 993a581f5e8c039b562bbacdf17615162d8a85f6 Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Wed, 28 Oct 2020 21:50:28 +0100
Subject: [PATCH 6/9] Make CIRCUITPY_SUPERVISOR_ALLOC_COUNT dependent on
 enabled features.

Avoids wasted memory and makes it easier to keep track of who needs how much for future additions.
---
 supervisor/shared/memory.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/supervisor/shared/memory.c b/supervisor/shared/memory.c
index 2be3b42d63f94..1760b9bd67a70 100755
--- a/supervisor/shared/memory.c
+++ b/supervisor/shared/memory.c
@@ -32,7 +32,32 @@
 #include "py/gc.h"
 #include "supervisor/shared/display.h"
 
-#define CIRCUITPY_SUPERVISOR_ALLOC_COUNT (12)
+enum {
+    CIRCUITPY_SUPERVISOR_ALLOC_COUNT =
+    // stack + heap
+    2
+#ifdef EXTERNAL_FLASH_DEVICES
+    + 1
+#endif
+#if CIRCUITPY_USB_MIDI
+    + 1
+#endif
+#if CIRCUITPY_DISPLAYIO
+    #if CIRCUITPY_TERMINALIO
+        + 1
+    #endif
+    + CIRCUITPY_DISPLAY_LIMIT * (
+        // Maximum needs of one display: max(4 if RGBMATRIX, 1 if SHARPDISPLAY, 0)
+        #if CIRCUITPY_RGBMATRIX
+            4
+        #elif CIRCUITPY_SHARPDISPLAY
+            1
+        #else
+            0
+        #endif
+    )
+#endif
+};
 
 // The lowest two bits of a valid length are always zero, so we can use them to mark an allocation
 // as a hole (freed by the client but not yet reclaimed into the free middle) and as movable.

From 9ecaa16eced40f3f56ba67737aed3f0950c5164f Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Sun, 29 Nov 2020 16:04:31 +0100
Subject: [PATCH 7/9] Unify redundant low/high_address computation to save a
 bit of code size.

---
 supervisor/shared/memory.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/supervisor/shared/memory.c b/supervisor/shared/memory.c
index 1760b9bd67a70..acace7f890fc7 100755
--- a/supervisor/shared/memory.c
+++ b/supervisor/shared/memory.c
@@ -134,9 +134,7 @@ supervisor_allocation* allocation_from_ptr(void *ptr) {
 }
 
 supervisor_allocation* allocate_remaining_memory(void) {
-    uint32_t* low_address = low_head ? low_head->data + low_head->length / 4 : port_heap_get_bottom();
-    uint32_t* high_address = high_head ? (uint32_t*)high_head : port_heap_get_top();
-    return allocate_memory((high_address - low_address) * 4 - sizeof(supervisor_allocation_node), false, false);
+    return allocate_memory((uint32_t)-1, false, false);
 }
 
 static supervisor_allocation_node* find_hole(supervisor_allocation_node* node, size_t length) {
@@ -152,6 +150,12 @@ static supervisor_allocation_node* allocate_memory_node(uint32_t length, bool hi
     // supervisor_move_memory() currently does not support movable allocations on the high side, it
     // must be extended first if this is ever needed.
     assert(!(high && movable));
+    uint32_t* low_address = low_head ? low_head->data + low_head->length / 4 : port_heap_get_bottom();
+    uint32_t* high_address = high_head ? (uint32_t*)high_head : port_heap_get_top();
+    // Special case for allocate_remaining_memory(), avoids computing low/high_address twice.
+    if (length == (uint32_t)-1) {
+        length = (high_address - low_address) * 4 - sizeof(supervisor_allocation_node);
+    }
     if (length == 0 || length % 4 != 0) {
         return NULL;
     }
@@ -159,8 +163,6 @@ static supervisor_allocation_node* allocate_memory_node(uint32_t length, bool hi
     supervisor_allocation_node* node = find_hole(high ? high_head : low_head, length);
     if (!node) {
         // 2. Enough free space in the middle?
-        uint32_t* low_address = low_head ? low_head->data + low_head->length / 4 : port_heap_get_bottom();
-        uint32_t* high_address = high_head ? (uint32_t*)high_head : port_heap_get_top();
         if ((high_address - low_address) * 4 >= (int32_t)(sizeof(supervisor_allocation_node) + length)) {
             if (high) {
                 high_address -= (sizeof(supervisor_allocation_node) + length) / 4;

From 11ed6f86f02f61ace55951d2fff62a9d36cf3ac7 Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Sun, 29 Nov 2020 16:27:36 +0100
Subject: [PATCH 8/9] Optimize out allocation moving code on boards that don't
 need it.

When no features are enabled that use movable allocations, supervisor_move_memory() is not needed.
---
 supervisor/shared/memory.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/supervisor/shared/memory.c b/supervisor/shared/memory.c
index acace7f890fc7..bc82804b70ef8 100755
--- a/supervisor/shared/memory.c
+++ b/supervisor/shared/memory.c
@@ -33,7 +33,7 @@
 #include "supervisor/shared/display.h"
 
 enum {
-    CIRCUITPY_SUPERVISOR_ALLOC_COUNT =
+    CIRCUITPY_SUPERVISOR_IMMOVABLE_ALLOC_COUNT =
     // stack + heap
     2
 #ifdef EXTERNAL_FLASH_DEVICES
@@ -42,6 +42,9 @@ enum {
 #if CIRCUITPY_USB_MIDI
     + 1
 #endif
+    ,
+    CIRCUITPY_SUPERVISOR_MOVABLE_ALLOC_COUNT =
+    0
 #if CIRCUITPY_DISPLAYIO
     #if CIRCUITPY_TERMINALIO
         + 1
@@ -57,6 +60,8 @@ enum {
         #endif
     )
 #endif
+    ,
+    CIRCUITPY_SUPERVISOR_ALLOC_COUNT = CIRCUITPY_SUPERVISOR_IMMOVABLE_ALLOC_COUNT + CIRCUITPY_SUPERVISOR_MOVABLE_ALLOC_COUNT
 };
 
 // The lowest two bits of a valid length are always zero, so we can use them to mark an allocation
@@ -147,6 +152,9 @@ static supervisor_allocation_node* find_hole(supervisor_allocation_node* node, s
 }
 
 static supervisor_allocation_node* allocate_memory_node(uint32_t length, bool high, bool movable) {
+    if (CIRCUITPY_SUPERVISOR_MOVABLE_ALLOC_COUNT == 0) {
+        assert(!movable);
+    }
     // supervisor_move_memory() currently does not support movable allocations on the high side, it
     // must be extended first if this is ever needed.
     assert(!(high && movable));
@@ -223,6 +231,11 @@ size_t get_allocation_length(supervisor_allocation* allocation) {
 }
 
 void supervisor_move_memory(void) {
+    // This whole function is not needed when there are no movable allocations, let it be optimized
+    // out.
+    if (CIRCUITPY_SUPERVISOR_MOVABLE_ALLOC_COUNT == 0) {
+        return;
+    }
     // This must be called exactly after freeing the heap, so that the embedded allocations, if any,
     // are now in the free region.
     assert(MP_STATE_VM(first_embedded_allocation) == NULL || (low_head < MP_STATE_VM(first_embedded_allocation) && MP_STATE_VM(first_embedded_allocation) < high_head));

From d6f8a43f6cc755b90376a66b75818744fbe63320 Mon Sep 17 00:00:00 2001
From: Christian Walther <cwalther@gmx.ch>
Date: Mon, 30 Nov 2020 23:33:07 +0100
Subject: [PATCH 9/9] Eliminate goto.

---
 supervisor/shared/memory.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/supervisor/shared/memory.c b/supervisor/shared/memory.c
index bc82804b70ef8..480c322b0112e 100755
--- a/supervisor/shared/memory.c
+++ b/supervisor/shared/memory.c
@@ -105,23 +105,23 @@ void free_memory(supervisor_allocation* allocation) {
     else {
         // Check if it's in the list of embedded allocations.
         supervisor_allocation_node** emb = &MP_STATE_VM(first_embedded_allocation);
-        while (*emb != NULL) {
-            if (*emb == node) {
-                // Found, remove it from the list.
-                *emb = node->next;
-                m_free(node
+        while (*emb != NULL && *emb != node) {
+            emb = &((*emb)->next);
+        }
+        if (*emb != NULL) {
+            // Found, remove it from the list.
+            *emb = node->next;
+            m_free(node
 #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
-                    , sizeof(supervisor_allocation_node) + (node->length & ~FLAGS)
+                , sizeof(supervisor_allocation_node) + (node->length & ~FLAGS)
 #endif
-                );
-                goto done;
-            }
-            emb = &((*emb)->next);
+            );
+        }
+        else {
+            // Else it must be within the low or high ranges and becomes a hole.
+            node->length = ((node->length & ~FLAGS) | HOLE);
         }
-        // Else it must be within the low or high ranges and becomes a hole.
-        node->length = ((node->length & ~FLAGS) | HOLE);
     }
-done:
     allocation->ptr = NULL;
 }