Skip to content

Commit f7fe5bf

Browse files
committed
optimize heap walks, by Sam Gross, upstream of python/cpython#114133
1 parent 855e3b2 commit f7fe5bf

File tree

2 files changed

+87
-25
lines changed

2 files changed

+87
-25
lines changed

src/heap.c

Lines changed: 73 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -528,46 +528,83 @@ void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) {
528528
}
529529

530530

531+
static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) {
532+
mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX);
533+
*shift = 64 - mi_clz(divisor - 1);
534+
*magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1);
535+
}
536+
537+
static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) {
538+
mi_assert_internal(n <= UINT32_MAX);
539+
return ((((uint64_t)n * magic) >> 32) + n) >> shift;
540+
}
541+
531542
bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) {
532543
mi_assert(area != NULL);
533544
if (area==NULL) return true;
534545
mi_assert(page != NULL);
535546
if (page == NULL) return true;
536547

537-
_mi_page_free_collect(page,true);
548+
_mi_page_free_collect(page,true); // collect both thread_delayed and local_free
538549
mi_assert_internal(page->local_free == NULL);
539550
if (page->used == 0) return true;
540551

541-
const size_t bsize = mi_page_block_size(page);
542-
const size_t ubsize = mi_page_usable_block_size(page); // without padding
543-
size_t psize;
544-
uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
552+
size_t psize;
553+
uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
554+
mi_heap_t* const heap = mi_page_heap(page);
555+
const size_t bsize = mi_page_block_size(page);
556+
const size_t ubsize = mi_page_usable_block_size(page); // without padding
545557

558+
// optimize page with one block
546559
if (page->capacity == 1) {
547-
// optimize page with one block
548560
mi_assert_internal(page->used == 1 && page->free == NULL);
549561
return visitor(mi_page_heap(page), area, pstart, ubsize, arg);
550562
}
563+
mi_assert(bsize <= UINT32_MAX);
564+
565+
// optimize full pages
566+
if (page->used == page->capacity) {
567+
uint8_t* block = pstart;
568+
for (size_t i = 0; i < page->capacity; i++) {
569+
if (!visitor(heap, area, block, ubsize, arg)) return false;
570+
block += bsize;
571+
}
572+
return true;
573+
}
551574

552575
// create a bitmap of free blocks.
553576
#define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*))
554-
uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)];
555-
memset(free_map, 0, sizeof(free_map));
577+
uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS];
578+
const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS);
579+
memset(free_map, 0, bmapsize * sizeof(intptr_t));
580+
if (page->capacity % MI_INTPTR_BITS != 0) {
581+
// mark left-over bits at the end as free
582+
size_t shift = (page->capacity % MI_INTPTR_BITS);
583+
uintptr_t mask = (UINTPTR_MAX << shift);
584+
free_map[bmapsize - 1] = mask;
585+
}
586+
587+
// fast repeated division by the block size
588+
uint64_t magic;
589+
size_t shift;
590+
mi_get_fast_divisor(bsize, &magic, &shift);
556591

557592
#if MI_DEBUG>1
558593
size_t free_count = 0;
559594
#endif
560-
for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
595+
for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) {
561596
#if MI_DEBUG>1
562597
free_count++;
563598
#endif
564599
mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
565600
size_t offset = (uint8_t*)block - pstart;
566601
mi_assert_internal(offset % bsize == 0);
567-
size_t blockidx = offset / bsize; // Todo: avoid division?
568-
mi_assert_internal( blockidx < MI_MAX_BLOCKS);
569-
size_t bitidx = (blockidx / sizeof(uintptr_t));
570-
size_t bit = blockidx - (bitidx * sizeof(uintptr_t));
602+
mi_assert_internal(offset <= UINT32_MAX);
603+
size_t blockidx = mi_fast_divide(offset, magic, shift);
604+
mi_assert_internal(blockidx == offset / bsize);
605+
mi_assert_internal(blockidx < MI_MAX_BLOCKS);
606+
size_t bitidx = (blockidx / MI_INTPTR_BITS);
607+
size_t bit = blockidx - (bitidx * MI_INTPTR_BITS);
571608
free_map[bitidx] |= ((uintptr_t)1 << bit);
572609
}
573610
mi_assert_internal(page->capacity == (free_count + page->used));
@@ -576,19 +613,30 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_
576613
#if MI_DEBUG>1
577614
size_t used_count = 0;
578615
#endif
579-
for (size_t i = 0; i < page->capacity; i++) {
580-
size_t bitidx = (i / sizeof(uintptr_t));
581-
size_t bit = i - (bitidx * sizeof(uintptr_t));
582-
uintptr_t m = free_map[bitidx];
583-
if (bit == 0 && m == UINTPTR_MAX) {
584-
i += (sizeof(uintptr_t) - 1); // skip a run of free blocks
616+
uint8_t* block = pstart;
617+
for (size_t i = 0; i < bmapsize; i++) {
618+
if (free_map[i] == 0) {
619+
// every block is in use
620+
for (size_t j = 0; j < MI_INTPTR_BITS; j++) {
621+
#if MI_DEBUG>1
622+
used_count++;
623+
#endif
624+
if (!visitor(heap, area, block, ubsize, arg)) return false;
625+
block += bsize;
626+
}
585627
}
586-
else if ((m & ((uintptr_t)1 << bit)) == 0) {
587-
#if MI_DEBUG>1
588-
used_count++;
589-
#endif
590-
uint8_t* block = pstart + (i * bsize);
591-
if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false;
628+
else {
629+
// visit the used blocks in the mask
630+
uintptr_t m = ~free_map[i];
631+
while (m != 0) {
632+
#if MI_DEBUG>1
633+
used_count++;
634+
#endif
635+
size_t bitidx = mi_ctz(m);
636+
if (!visitor(heap, area, block + (bitidx * bsize), ubsize, arg)) return false;
637+
m &= m - 1; // clear least significant bit
638+
}
639+
block += bsize * MI_INTPTR_BITS;
592640
}
593641
}
594642
mi_assert_internal(page->used == used_count);

test/test-stress.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,16 @@ static void free_items(void* p) {
129129
custom_free(p);
130130
}
131131

132+
/*
133+
static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) {
134+
(void)(heap); (void)(area);
135+
size_t* total = (size_t*)arg;
136+
if (block != NULL) {
137+
total += block_size;
138+
}
139+
return true;
140+
}
141+
*/
132142

133143
static void stress(intptr_t tid) {
134144
//bench_start_thread();
@@ -173,6 +183,10 @@ static void stress(intptr_t tid) {
173183
data[data_idx] = q;
174184
}
175185
}
186+
// walk the heap
187+
// size_t total = 0;
188+
// mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total);
189+
176190
// free everything that is left
177191
for (size_t i = 0; i < retain_top; i++) {
178192
free_items(retained[i]);

0 commit comments

Comments
 (0)