Skip to content

Commit 4c15bbf

Browse files
committed
Extend khash maps with tracemalloc support (works for Python >= 3.6), similar to the way NumPy does it
1 parent 52a1725 commit 4c15bbf

File tree

5 files changed

+125
-13
lines changed

5 files changed

+125
-13
lines changed

pandas/_libs/hashtable.pyx

+5-1
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,14 @@ cnp.import_array()
1313

1414

1515
from pandas._libs cimport util
16-
from pandas._libs.khash cimport kh_str_t, khiter_t
16+
from pandas._libs.khash cimport KHASH_TRACE_DOMAIN, kh_str_t, khiter_t
1717
from pandas._libs.missing cimport checknull
1818

1919

20+
def get_hashtable_trace_domain():
    """
    Return the tracemalloc domain id used for khash allocations.

    This is the ``KHASH_TRACE_DOMAIN`` constant cimported from
    ``pandas._libs.khash``.
    """
    return KHASH_TRACE_DOMAIN
22+
23+
2024
cdef int64_t NPY_NAT = util.get_nat()
2125
SIZE_HINT_LIMIT = (1 << 20) + 7
2226

pandas/_libs/khash.pxd

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ from numpy cimport (
1414

1515

1616
cdef extern from "khash_python.h":
17+
const int KHASH_TRACE_DOMAIN
18+
1719
ctypedef uint32_t khint_t
1820
ctypedef khint_t khiter_t
1921

pandas/_libs/src/klib/khash.h

+28-10
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,24 @@ int main() {
115115
#include "../inline_helper.h"
116116

117117

118+
// hooks for the memory allocator; the C runtime allocator is used by default
119+
#ifndef KHASH_MALLOC
120+
#define KHASH_MALLOC malloc
121+
#endif
122+
123+
#ifndef KHASH_REALLOC
124+
#define KHASH_REALLOC realloc
125+
#endif
126+
127+
#ifndef KHASH_CALLOC
128+
#define KHASH_CALLOC calloc
129+
#endif
130+
131+
#ifndef KHASH_FREE
132+
#define KHASH_FREE free
133+
#endif
134+
135+
118136
#if UINT_MAX == 0xffffffffu
119137
typedef unsigned int khint32_t;
120138
#elif ULONG_MAX == 0xffffffffu
@@ -265,14 +283,14 @@ static const double __ac_HASH_UPPER = 0.77;
265283
khval_t *vals; \
266284
} kh_##name##_t; \
267285
SCOPE kh_##name##_t *kh_init_##name(void) { \
268-
return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
286+
return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \
269287
} \
270288
SCOPE void kh_destroy_##name(kh_##name##_t *h) \
271289
{ \
272290
if (h) { \
273-
free(h->keys); free(h->flags); \
274-
free(h->vals); \
275-
free(h); \
291+
KHASH_FREE(h->keys); KHASH_FREE(h->flags); \
292+
KHASH_FREE(h->vals); \
293+
KHASH_FREE(h); \
276294
} \
277295
} \
278296
SCOPE void kh_clear_##name(kh_##name##_t *h) \
@@ -305,11 +323,11 @@ static const double __ac_HASH_UPPER = 0.77;
305323
if (new_n_buckets < 4) new_n_buckets = 4; \
306324
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
307325
else { /* hash table size to be changed (shrink or expand); rehash */ \
308-
new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
326+
new_flags = (khint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
309327
memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
310328
if (h->n_buckets < new_n_buckets) { /* expand */ \
311-
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
312-
if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
329+
h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
330+
if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
313331
} /* otherwise shrink */ \
314332
} \
315333
} \
@@ -342,10 +360,10 @@ static const double __ac_HASH_UPPER = 0.77;
342360
} \
343361
} \
344362
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
345-
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
346-
if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
363+
h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
364+
if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
347365
} \
348-
free(h->flags); /* free the working space */ \
366+
KHASH_FREE(h->flags); /* free the working space */ \
349367
h->flags = new_flags; \
350368
h->n_buckets = new_n_buckets; \
351369
h->n_occupied = h->size; \

pandas/_libs/src/klib/khash_python.h

+55-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,59 @@
11
#include <string.h>
22
#include <Python.h>
33

4+
// khash should report usage to tracemalloc
5+
#if PY_VERSION_HEX >= 0x03060000
6+
#include <pymem.h>
7+
#if PY_VERSION_HEX < 0x03070000
8+
#define PyTraceMalloc_Track _PyTraceMalloc_Track
9+
#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack
10+
#endif
11+
#else
12+
#define PyTraceMalloc_Track(...)
13+
#define PyTraceMalloc_Untrack(...)
14+
#endif
15+
16+
17+
static const int KHASH_TRACE_DOMAIN = 424242;
18+
void *traced_malloc(size_t size){
19+
void * ptr = malloc(size);
20+
if(ptr!=NULL){
21+
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
22+
}
23+
return ptr;
24+
}
25+
26+
void *traced_calloc(size_t num, size_t size){
27+
void * ptr = calloc(num, size);
28+
if(ptr!=NULL){
29+
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size);
30+
}
31+
return ptr;
32+
}
33+
34+
void *traced_realloc(void* old_ptr, size_t size){
35+
void * ptr = realloc(old_ptr, size);
36+
if(ptr!=NULL){
37+
if(old_ptr != ptr){
38+
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
39+
}
40+
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
41+
}
42+
return ptr;
43+
}
44+
45+
void traced_free(void* ptr){
46+
if(ptr!=NULL){
47+
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr);
48+
}
49+
free(ptr);
50+
}
51+
52+
53+
#define KHASH_MALLOC traced_malloc
54+
#define KHASH_REALLOC traced_realloc
55+
#define KHASH_CALLOC traced_calloc
56+
#define KHASH_FREE traced_free
457
#include "khash.h"
558

659
// Previously we were using the built in cpython hash function for doubles
@@ -128,7 +181,7 @@ typedef struct {
128181
typedef kh_str_starts_t* p_kh_str_starts_t;
129182

130183
p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) {
131-
kh_str_starts_t *result = (kh_str_starts_t*)calloc(1, sizeof(kh_str_starts_t));
184+
kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t));
132185
result->table = kh_init_str();
133186
return result;
134187
}
@@ -151,7 +204,7 @@ khint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const
151204

152205
void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
153206
kh_destroy_str(table->table);
154-
free(table);
207+
KHASH_FREE(table);
155208
}
156209

157210
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) {

pandas/tests/libs/test_hashtable.py

+35
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from contextlib import contextmanager
2+
import tracemalloc
3+
14
import numpy as np
25
import pytest
36

@@ -6,6 +9,23 @@
69
import pandas._testing as tm
710

811

12+
@contextmanager
def activated_tracemalloc():
    """
    Context manager that makes sure tracemalloc is tracing inside the block.

    If tracing is not yet active it is started on entry and stopped on exit,
    even when the body raises. If tracing was already active when the block
    was entered, it is left running on exit so the caller's tracing session
    is not switched off as a side effect.
    """
    already_tracing = tracemalloc.is_tracing()
    if not already_tracing:
        tracemalloc.start()
    try:
        yield
    finally:
        # Only undo what this context manager did itself.
        if not already_tracing:
            tracemalloc.stop()
19+
20+
21+
def get_allocated_khash_memory():
    """
    Return the total size of all allocations traced in the khash domain.

    A tracemalloc snapshot is taken, reduced to the traces recorded under the
    domain reported by ``ht.get_hashtable_trace_domain()``, and the sizes of
    the remaining traces are summed.
    """
    full_snapshot = tracemalloc.take_snapshot()
    khash_only = tracemalloc.DomainFilter(True, ht.get_hashtable_trace_domain())
    khash_traces = full_snapshot.filter_traces((khash_only,)).traces
    return sum(trace.size for trace in khash_traces)
27+
28+
929
@pytest.mark.parametrize(
1030
"table_type, dtype",
1131
[
@@ -101,6 +121,21 @@ def test_unique(self, table_type, dtype):
101121
unique = table.unique(keys)
102122
tm.assert_numpy_array_equal(unique, expected)
103123

124+
def test_tracemalloc_works(self, table_type, dtype):
    """Check that khash allocations of a hash table are visible to tracemalloc."""
    # 8-bit dtypes can hold only 256 distinct values; other dtypes get a
    # larger key set.
    n_keys = 256 if dtype in (np.int8, np.uint8) else 30000
    keys = np.arange(n_keys).astype(dtype)
    with activated_tracemalloc():
        table = table_type()
        table.map_locations(keys)
        # What tracemalloc recorded for the khash domain must match the
        # size the table reports for itself.
        traced_bytes = get_allocated_khash_memory()
        reported_bytes = table.sizeof()
        assert traced_bytes == reported_bytes
        # Once the table is gone, no khash allocation may remain traced.
        del table
        assert get_allocated_khash_memory() == 0
138+
104139

105140
@pytest.mark.parametrize(
106141
"table_type, dtype",

0 commit comments

Comments
 (0)