Skip to content

Commit 425c1db

Browse files
committed
PERF-#5247: Make MultiIndex use memory more efficiently
Signed-off-by: Dmitry Chigarev <[email protected]>
1 parent 21ab814 commit 425c1db

File tree

1 file changed

+13
-3
lines changed
  • modin/core/storage_formats/pandas

1 file changed

+13
-3
lines changed

modin/core/storage_formats/pandas/utils.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,21 @@ def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None)
100100
length_list = np.insert(length_list, obj=0, values=[0])
101101

102102
sums = np.cumsum(length_list)
103+
axis = 0 if isinstance(result, pandas.Series) else axis
103104
# We do this to restore block partitioning
104-
if axis == 0 or isinstance(result, pandas.Series):
105-
return [result.iloc[sums[i] : sums[i + 1]] for i in range(len(sums) - 1)]
105+
if axis == 0:
106+
chunked = [result.iloc[sums[i] : sums[i + 1]] for i in range(len(sums) - 1)]
106107
else:
107-
return [result.iloc[:, sums[i] : sums[i + 1]] for i in range(len(sums) - 1)]
108+
chunked = [result.iloc[:, sums[i] : sums[i + 1]] for i in range(len(sums) - 1)]
109+
110+
return [
111+
# A sliced MultiIndex still stores all encoded values of the original index, so we
112+
# explicitly ask it to drop the unused values in order to save memory.
113+
chunk.set_axis(chunk.axes[axis].remove_unused_levels(), axis=axis, copy=False)
114+
if isinstance(chunk.axes[axis], pandas.MultiIndex)
115+
else chunk
116+
for chunk in chunked
117+
]
108118

109119

110120
def get_length_list(axis_len: int, num_splits: int) -> list:

0 commit comments

Comments
 (0)