Skip to content

Commit c86422a

Browse files
authored
FIX-#1927: Fix performance issue related to sparse attribute access (#2318)
Signed-off-by: Igoshev, Yaroslav <[email protected]>
1 parent 737ec34 commit c86422a

File tree

6 files changed

+138
-20
lines changed

6 files changed

+138
-20
lines changed

modin/pandas/accessor.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Licensed to Modin Development Team under one or more contributor license agreements.
2+
# See the NOTICE file distributed with this work for additional information regarding
3+
# copyright ownership. The Modin Development Team licenses this file to you under the
4+
# Apache License, Version 2.0 (the "License"); you may not use this file except in
5+
# compliance with the License. You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under
10+
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific language
12+
# governing permissions and limitations under the License.
13+
14+
import pandas
15+
from pandas.core.arrays.sparse.dtype import SparseDtype
16+
17+
from modin.utils import _inherit_docstrings
18+
19+
20+
class BaseSparseAccessor:
    """Shared plumbing for the ``.sparse`` accessor classes.

    An instance remembers the object it was created for (``_parent``) and
    immediately asks ``_validate`` to approve it. Concrete accessors must
    override ``_validate``; the base implementation always raises.
    """

    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # The parent is stored before validation runs, so ``_validate`` may
        # rely on ``self._parent`` already being set.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Approve ``data`` for sparse access; must be overridden by subclasses."""
        raise NotImplementedError

    def _default_to_pandas(self, op, *args, **kwargs):
        """Delegate ``op`` to the parent's own ``_default_to_pandas``.

        The parent is handed a one-argument callable; whatever object the
        parent passes in has its ``sparse`` attribute looked up, and ``op`` is
        invoked as ``op(<that attribute>, *args, **kwargs)``. The parent's
        return value is returned unchanged.
        """
        # Kept as a lambda on purpose: the parent may inspect the callable
        # (e.g. its ``__name__``), so its identity is left as it was.
        call_on_sparse = lambda obj: op(obj.sparse, *args, **kwargs)  # noqa: E731
        return self._parent._default_to_pandas(call_on_sparse)
34+
35+
36+
@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseFrameAccessor)
class SparseFrameAccessor(BaseSparseAccessor):
    # Frame-level ``.sparse`` accessor.
    # NOTE(review): docstrings are presumably supplied from the pandas
    # counterpart by ``_inherit_docstrings`` -- confirm; for that reason the
    # notes in this class are plain comments rather than docstrings.

    def _validate(self, data):
        # Every column dtype must be a SparseDtype. Raising AttributeError
        # (rather than another exception type) is what makes
        # ``hasattr(frame, "sparse")`` evaluate to False for non-sparse data.
        if not all(isinstance(t, SparseDtype) for t in data.dtypes):
            raise AttributeError(self._validation_msg)

    @property
    def density(self):
        return self._parent._default_to_pandas(pandas.DataFrame.sparse).density

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        # Alternate constructor: no parent object exists yet, so the
        # instance-level ``_default_to_pandas`` helper cannot be used here.
        # Calling it through ``cls`` would bind the pandas function as ``self``
        # and ``data`` as ``op``, then fail on ``self._parent``.
        # Instead, build the frame with pandas and wrap the result.
        #
        # Imported locally because the ``dataframe`` module imports this
        # module at import time (a top-level import would be circular).
        # Assumes ``DataFrame`` accepts a pandas DataFrame as its data
        # argument -- TODO confirm.
        from .dataframe import DataFrame

        return DataFrame(
            pandas.DataFrame.sparse.from_spmatrix(data, index=index, columns=columns)
        )

    def to_dense(self):
        return self._default_to_pandas(pandas.DataFrame.sparse.to_dense)

    def to_coo(self):
        return self._default_to_pandas(pandas.DataFrame.sparse.to_coo)
58+
59+
60+
@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseAccessor)
class SparseAccessor(BaseSparseAccessor):
    # Series-level ``.sparse`` accessor.
    # NOTE(review): docstrings are presumably supplied from the pandas
    # counterpart by ``_inherit_docstrings`` -- confirm; for that reason the
    # notes in this class are plain comments rather than docstrings.

    def _validate(self, data):
        # The series dtype must be a SparseDtype. Raising AttributeError
        # (rather than another exception type) is what makes
        # ``hasattr(series, "sparse")`` evaluate to False for non-sparse data.
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    @property
    def density(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).density

    @property
    def fill_value(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).fill_value

    @property
    def npoints(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).npoints

    @property
    def sp_values(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).sp_values

    @classmethod
    def from_coo(cls, A, dense_index=False):
        # Alternate constructor: no parent object exists yet, so the
        # instance-level ``_default_to_pandas`` helper cannot be used here.
        # Calling it through ``cls`` would bind the pandas function as ``self``
        # and ``A`` as ``op``, then fail on ``self._parent``.
        # Instead, build the series with pandas and wrap the result.
        #
        # Imported locally because the ``series`` module imports this module
        # at import time (a top-level import would be circular).
        # Assumes ``Series`` accepts a pandas Series as its data argument
        # -- TODO confirm.
        from .series import Series

        return Series(pandas.Series.sparse.from_coo(A, dense_index=dense_index))

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        return self._default_to_pandas(
            pandas.Series.sparse.to_coo,
            row_levels=row_levels,
            column_levels=column_levels,
            sort_labels=sort_labels,
        )

    def to_dense(self):
        return self._default_to_pandas(pandas.Series.sparse.to_dense)
98+
99+
100+
@_inherit_docstrings(pandas.core.accessor.CachedAccessor)
class CachedAccessor:
    # Descriptor that exposes an accessor class under a fixed attribute name.
    # NOTE(review): docstrings are presumably supplied from the pandas
    # counterpart by ``_inherit_docstrings`` -- confirm; for that reason the
    # notes in this class are plain comments rather than docstrings.

    def __init__(self, name: str, accessor) -> None:
        # ``name`` is the attribute name used when storing the built accessor
        # on an instance; ``accessor`` is the callable that builds it.
        self._name, self._accessor = name, accessor

    def __get__(self, obj, cls):
        if obj is not None:
            # Instance access: build the accessor for this object and store it
            # on the object under ``self._name``. ``object.__setattr__`` is
            # called directly, which bypasses any ``__setattr__`` override on
            # the object's class.
            built = self._accessor(obj)
            object.__setattr__(obj, self._name, built)
            return built
        # Class-level access (``Owner.<name>``): hand back the accessor itself.
        return self._accessor

modin/pandas/dataframe.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
from .series import Series
5555
from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
5656
from .groupby import DataFrameGroupBy
57+
from .accessor import CachedAccessor, SparseFrameAccessor
5758

5859

5960
@_inherit_docstrings(pandas.DataFrame, excluded=[pandas.DataFrame.__init__])
@@ -1594,9 +1595,7 @@ def set_index(
15941595
if not inplace:
15951596
return frame
15961597

1597-
@property
1598-
def sparse(self):
1599-
return self._default_to_pandas(pandas.DataFrame.sparse)
1598+
sparse = CachedAccessor("sparse", SparseFrameAccessor)
16001599

16011600
def squeeze(self, axis=None):
16021601
axis = self._get_axis_number(axis) if axis is not None else None

modin/pandas/series.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
4242
from .iterator import PartitionIterator
4343
from .utils import from_pandas, is_scalar
44+
from .accessor import CachedAccessor, SparseAccessor
4445

4546

4647
@_inherit_docstrings(pandas.Series, excluded=[pandas.Series.__init__])
@@ -1187,9 +1188,7 @@ def sort_values(
11871188
result._query_compiler, inplace=inplace
11881189
)
11891190

1190-
@property
1191-
def sparse(self):
1192-
return self._default_to_pandas(pandas.Series.sparse)
1191+
sparse = CachedAccessor("sparse", SparseAccessor)
11931192

11941193
def squeeze(self, axis=None):
11951194
if axis is not None:

modin/pandas/test/dataframe/test_default.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,6 +1151,13 @@ def test___bool__(data):
11511151
eval_general(*create_test_dfs(data), lambda df: df.__bool__())
11521152

11531153

1154-
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
1155-
def test_hasattr_sparse(data):
1156-
eval_general(*create_test_dfs(data), lambda df: hasattr(df, "sparse"))
1154+
@pytest.mark.parametrize(
    "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
)
def test_hasattr_sparse(is_sparse_data):
    # Compare ``hasattr(df, "sparse")`` between the two frames returned by
    # ``create_test_dfs``, once for sparse input and once for the plain
    # "float_nan_data" input.
    source = test_data["float_nan_data"]
    if is_sparse_data:
        source = pandas.arrays.SparseArray(source.values())
    modin_df, pandas_df = create_test_dfs(source)
    eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse"))

modin/pandas/test/test_api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def test_top_level_api_equality():
4848
"DEFAULT_NPARTITIONS",
4949
"iterator",
5050
"series",
51+
"accessor",
5152
"base",
5253
"utils",
5354
"dataframe",

modin/pandas/test/test_series.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4397,17 +4397,18 @@ def test_encode(data, encoding_type):
43974397
df_equals(modin_result, pandas_result)
43984398

43994399

4400-
@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
4401-
def test_hasattr_sparse(data):
4402-
modin_series, pandas_series = create_test_series(data)
4403-
try:
4404-
pandas_result = hasattr(pandas_series, "sparse")
4405-
except Exception as e:
4406-
with pytest.raises(type(e)):
4407-
hasattr(modin_series, "sparse")
4408-
else:
4409-
modin_result = hasattr(modin_series, "sparse")
4410-
assert modin_result == pandas_result
4400+
@pytest.mark.parametrize(
    "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
)
def test_hasattr_sparse(is_sparse_data):
    # Compare ``hasattr(series, "sparse")`` between the two series returned by
    # ``create_test_series``, once for sparse input and once for the plain
    # "float_nan_data" input.
    source = test_data["float_nan_data"]
    if is_sparse_data:
        source = pandas.arrays.SparseArray(source.values())
    modin_series, pandas_series = create_test_series(source)
    eval_general(modin_series, pandas_series, lambda df: hasattr(df, "sparse"))
44114412

44124413

44134414
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)