Merged
Changes from 7 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.2.rst
@@ -17,6 +17,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`)
- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`)
- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)

.. ---------------------------------------------------------------------------
.. _whatsnew_212.bug_fixes:
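
For context on the entry above, a minimal sketch (sizes illustrative, not taken from the issue reports) of the access pattern that hit the regression: per-column access on a wide DataFrame records one block reference per column, so before this fix the reference list was rescanned on every one of those accesses.

import numpy as np
import pandas as pd

# A wide frame: many columns, few rows (illustrative sizes).
df = pd.DataFrame(np.random.randn(10, 5_000))

# Touches every column individually; each returned Series registers a
# reference to the parent block, growing the reference list column by column.
for _, col in df.items():
    col.sum()
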
24 changes: 18 additions & 6 deletions pandas/_libs/internals.pyx
@@ -890,17 +890,29 @@ cdef class BlockValuesRefs:
"""
cdef:
public list referenced_blocks
public int clear_counter

def __cinit__(self, blk: Block | None = None) -> None:
if blk is not None:
self.referenced_blocks = [weakref.ref(blk)]
else:
self.referenced_blocks = []

def _clear_dead_references(self) -> None:
self.referenced_blocks = [
ref for ref in self.referenced_blocks if ref() is not None
]
self.clear_counter = 500 # set reasonably high

def _clear_dead_references(self, force=False) -> None:
# Use exponential backoff to decide when we want to clear references
# if force=False. Clearing for every insertion causes slowdowns if
# all these objects stay alive, e.g. df.items() for wide DataFrames
# see GH#55245 and GH#55008
Member:
Do we have a reference issue to eventually change this to a WeakSet-like implementation? IIRC I saw that discussed somewhere.

Member (Author):
I'll open an issue about that as a follow-up once this is in.
if force or len(self.referenced_blocks) > self.clear_counter:
self.referenced_blocks = [
ref for ref in self.referenced_blocks if ref() is not None
]
nr_of_refs = len(self.referenced_blocks)
if nr_of_refs < self.clear_counter // 2:
self.clear_counter = max(self.clear_counter // 2, 500)
Contributor @wangwillian0 (Oct 16, 2023):
I would suggest a shrink factor of 4 or more. If it is the same as the growth factor, it can create a few corner cases that still have O(n^2) behaviour, e.g. the length going back and forth between (500*2^n)-1 and (500*2^n)+1.

Member (Author):
Couldn't this happen as well with a shrink factor of 4? And this would only happen if we have this interleaved with in-place modifications, e.g. if force=True, correct? Merging for now, but happy to follow up.

Member:
> Merging for now, but happy to follow up

+1

Contributor @wangwillian0 (Oct 22, 2023):
For a factor of 4 you would need to move the length between the extremes of the range [500*2^(n-1), 500*2^n], which is at least 500 apart (and more for larger n); this is much better than triggering the slow operation by just adding and removing 3 references.
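
To put numbers on the growth and shrink rules being discussed, here is a standalone toy trace of the counter update (plain Python, illustrative only; the values mirror the new test further down, and the diff picks up again right after this aside).

def update_counter(counter, live_refs):
    # Same rule as _clear_dead_references applies once dead weakrefs are dropped.
    if live_refs < counter // 2:
        return max(counter // 2, 500)       # shrink, but never below 500
    elif live_refs > counter:
        return max(counter * 2, live_refs)  # grow past the live population
    return counter                          # otherwise leave it alone

print(update_counter(500, 511))   # 1000: more live refs than the threshold, so it doubles
print(update_counter(1000, 511))  # 1000: still above half the threshold, unchanged
print(update_counter(1000, 301))  # 500: well below half, so the threshold halves
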

elif nr_of_refs > self.clear_counter:
self.clear_counter = max(self.clear_counter * 2, nr_of_refs)

def add_reference(self, blk: Block) -> None:
"""Adds a new reference to our reference collection.
@@ -934,6 +946,6 @@ cdef class BlockValuesRefs:
-------
bool
"""
self._clear_dead_references()
self._clear_dead_references(force=True)
# Checking for more references than block pointing to itself
return len(self.referenced_blocks) > 1
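
Taking the file as a whole, a plain-Python sketch of the behaviour this diff implements may help; the real class is a Cython cdef class, and the body of add_reference is not shown in the diff, so its clear-then-append shape below is an assumption.

import weakref

class BlockValuesRefsSketch:
    """Illustrative stand-in for BlockValuesRefs with exponential backoff."""

    def __init__(self, blk=None):
        self.referenced_blocks = [] if blk is None else [weakref.ref(blk)]
        self.clear_counter = 500  # set reasonably high

    def _clear_dead_references(self, force=False):
        # Pay for the O(n) scan only when forced or when the list has
        # outgrown the counter; scanning on every insertion is what made
        # wide-DataFrame workloads quadratic.
        if force or len(self.referenced_blocks) > self.clear_counter:
            self.referenced_blocks = [
                ref for ref in self.referenced_blocks if ref() is not None
            ]
            nr_of_refs = len(self.referenced_blocks)
            if nr_of_refs < self.clear_counter // 2:
                self.clear_counter = max(self.clear_counter // 2, 500)
            elif nr_of_refs > self.clear_counter:
                self.clear_counter = max(self.clear_counter * 2, nr_of_refs)

    def add_reference(self, blk):
        # Assumed shape: clear lazily, then record the new weak reference.
        self._clear_dead_references()
        self.referenced_blocks.append(weakref.ref(blk))

    def has_reference(self):
        # Accuracy matters here, so always force a clear before counting.
        self._clear_dead_references(force=True)
        return len(self.referenced_blocks) > 1
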
29 changes: 29 additions & 0 deletions pandas/tests/copy_view/test_internals.py
@@ -119,3 +119,32 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
else:
for col in df.columns:
assert not np.shares_memory(get_array(df, col), get_array(df2, col))


def test_exponential_backoff():
df = DataFrame({"a": [1, 2, 3]})
for i in range(490):
df.copy(deep=False)

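    # 491 = the weakref to df's own block plus 490 now-dead weakrefs from the
    # discarded copies; nothing was cleared because the list never exceeded 500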
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491

df = DataFrame({"a": [1, 2, 3]})
dfs = [df.copy(deep=False) for i in range(510)]

for i in range(20):
df.copy(deep=False)
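    # 531 = df's own block + 510 live copies + 20 dead temporaries; the counter
    # doubled to 1000 when the live copies pushed the list past 500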
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531
assert df._mgr.blocks[0].refs.clear_counter == 1000

for i in range(500):
df.copy(deep=False)

# Don't reduce since we still have over 500 objects alive
assert df._mgr.blocks[0].refs.clear_counter == 1000

dfs = dfs[:300]
for i in range(500):
df.copy(deep=False)

    # Reduce since there are fewer than 500 objects alive
assert df._mgr.blocks[0].refs.clear_counter == 500