Skip to content

Commit 583b908

Browse files
CoW: Use weakref callbacks to track dead references
Co-authored-by: José Lucas Silva Mayer <[email protected]>
1 parent ef2c61a commit 583b908

File tree

2 files changed, with 49 additions and 33 deletions.

pandas/_libs/internals.pyx

+27 additions, -24 deletions
Original file line number | Diff line number | Diff line change
@@ -890,29 +890,35 @@ cdef class BlockValuesRefs:
890890
"""
891891
cdef:
892892
public list referenced_blocks
893-
public int clear_counter
893+
public int dead_counter
894+
object __weakref__
895+
object _weakref_cb
894896

895897
def __cinit__(self, blk: Block | None = None) -> None:
896898
if blk is not None:
897-
self.referenced_blocks = [weakref.ref(blk)]
899+
self.referenced_blocks = [weakref.ref(blk, self._weakref_cb)]
898900
else:
899901
self.referenced_blocks = []
900-
self.clear_counter = 500 # set reasonably high
901-
902-
def _clear_dead_references(self, force=False) -> None:
903-
# Use exponential backoff to decide when we want to clear references
904-
# if force=False. Clearing for every insertion causes slowdowns if
905-
# all these objects stay alive, e.g. df.items() for wide DataFrames
906-
# see GH#55245 and GH#55008
907-
if force or len(self.referenced_blocks) > self.clear_counter:
908-
self.referenced_blocks = [
909-
ref for ref in self.referenced_blocks if ref() is not None
910-
]
911-
nr_of_refs = len(self.referenced_blocks)
912-
if nr_of_refs < self.clear_counter // 2:
913-
self.clear_counter = max(self.clear_counter // 2, 500)
914-
elif nr_of_refs > self.clear_counter:
915-
self.clear_counter = max(self.clear_counter * 2, nr_of_refs)
902+
903+
def __init__(self, blk: Block | None = None) -> None:
904+
def _weakref_cb(
905+
item: weakref.ref,
906+
selfref: weakref.ref = weakref.ref(self)
907+
) -> None:
908+
self = selfref()
909+
if self is not None:
910+
self.dead_counter += 1
911+
if self.dead_counter > 256:
912+
if self.dead_counter > len(self.referenced_blocks) // 2:
913+
self._clear_dead_references()
914+
self._weakref_cb = _weakref_cb
915+
916+
def _clear_dead_references(self) -> None:
917+
old_len = len(self.referenced_blocks)
918+
self.referenced_blocks = [
919+
ref for ref in self.referenced_blocks if ref() is not None
920+
]
921+
self.dead_counter = self.dead_counter - (old_len - len(self.referenced_blocks))
916922

917923
def add_reference(self, blk: Block) -> None:
918924
"""Adds a new reference to our reference collection.
@@ -922,8 +928,7 @@ cdef class BlockValuesRefs:
922928
blk : Block
923929
The block that the new references should point to.
924930
"""
925-
self._clear_dead_references()
926-
self.referenced_blocks.append(weakref.ref(blk))
931+
self.referenced_blocks.append(weakref.ref(blk, self._weakref_cb))
927932

928933
def add_index_reference(self, index: object) -> None:
929934
"""Adds a new reference to our reference collection when creating an index.
@@ -933,8 +938,7 @@ cdef class BlockValuesRefs:
933938
index : Index
934939
The index that the new reference should point to.
935940
"""
936-
self._clear_dead_references()
937-
self.referenced_blocks.append(weakref.ref(index))
941+
self.referenced_blocks.append(weakref.ref(index, self._weakref_cb))
938942

939943
def has_reference(self) -> bool:
940944
"""Checks if block has foreign references.
@@ -946,6 +950,5 @@ cdef class BlockValuesRefs:
946950
-------
947951
bool
948952
"""
949-
self._clear_dead_references(force=True)
950953
# Checking for more references than block pointing to itself
951-
return len(self.referenced_blocks) > 1
954+
return len(self.referenced_blocks) - self.dead_counter > 1

pandas/tests/copy_view/test_internals.py

+22 additions, -9 deletions
Original file line number | Diff line number | Diff line change
@@ -121,31 +121,44 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
121121
assert not np.shares_memory(get_array(df, col), get_array(df2, col))
122122

123123

124-
def test_exponential_backoff():
125-
# GH#55518
124+
def test_clear_dead_references():
125+
# GH#55539
126126
df = DataFrame({"a": [1, 2, 3]})
127127
for i in range(490):
128128
df.copy(deep=False)
129129

130-
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491
130+
assert (
131+
len(df._mgr.blocks[0].refs.referenced_blocks)
132+
- df._mgr.blocks[0].refs.dead_counter
133+
== 1
134+
)
131135

132136
df = DataFrame({"a": [1, 2, 3]})
133137
dfs = [df.copy(deep=False) for i in range(510)]
134138

135139
for i in range(20):
136140
df.copy(deep=False)
137-
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531
138-
assert df._mgr.blocks[0].refs.clear_counter == 1000
141+
assert (
142+
len(df._mgr.blocks[0].refs.referenced_blocks)
143+
- df._mgr.blocks[0].refs.dead_counter
144+
== 511
145+
)
139146

140147
for i in range(500):
141148
df.copy(deep=False)
142149

143-
# Don't reduce since we still have over 500 objects alive
144-
assert df._mgr.blocks[0].refs.clear_counter == 1000
150+
assert (
151+
len(df._mgr.blocks[0].refs.referenced_blocks)
152+
- df._mgr.blocks[0].refs.dead_counter
153+
== 511
154+
)
145155

146156
dfs = dfs[:300]
147157
for i in range(500):
148158
df.copy(deep=False)
149159

150-
# Reduce since there are less than 500 objects alive
151-
assert df._mgr.blocks[0].refs.clear_counter == 500
160+
assert (
161+
len(df._mgr.blocks[0].refs.referenced_blocks)
162+
- df._mgr.blocks[0].refs.dead_counter
163+
== 301
164+
)

0 commit comments

Comments
 (0)