From 5bec6b31bac1fc3a81676e63db32d2297422917b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 24 May 2019 14:44:27 -0400 Subject: [PATCH 01/10] BUG: preserve categorical & sparse types when grouping / pivot preserve dtypes when applying a ufunc to a sparse dtype closes #18502 closes #23743 --- doc/source/whatsnew/v0.25.0.rst | 59 ++++++++++++++++++++ pandas/core/dtypes/cast.py | 8 ++- pandas/core/frame.py | 45 +++++++++++++++ pandas/core/generic.py | 24 ++------ pandas/core/groupby/generic.py | 11 +++- pandas/core/groupby/groupby.py | 27 +++++++-- pandas/core/groupby/ops.py | 6 +- pandas/core/internals/blocks.py | 16 +++++- pandas/core/nanops.py | 9 +-- pandas/core/series.py | 27 +++++++-- pandas/tests/groupby/test_function.py | 47 ++++++++-------- pandas/tests/groupby/test_nth.py | 19 ++++--- pandas/tests/resample/test_datetime_index.py | 6 ++ pandas/tests/sparse/frame/test_analytics.py | 16 +++++- pandas/tests/sparse/series/test_analytics.py | 16 ++++++ pandas/tests/sparse/test_groupby.py | 10 ++-- pandas/tests/sparse/test_pivot.py | 16 +++++- 17 files changed, 282 insertions(+), 80 deletions(-) create mode 100644 pandas/tests/sparse/series/test_analytics.py diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6b74865f6619..6a792500029e0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -316,6 +316,65 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t s s.str.startswith(b'a') +<<<<<<< HEAD +======= +.. _whatsnew_0250.api_breaking.ufuncs: + +ufuncs on Extension Dtype +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Operations with ``numpy`` ufuncs on Extension Arrays, including Sparse Dtypes will now coerce the +resulting dtypes to same as the input dtype; previously this would coerce to a dense dtype. (:issue:`23743`) + +.. ipython:: python + + df = pd.DataFrame({'A': pd.Series([1, np.nan, 3], dtype=pd.SparseDtype('float64', np.nan))}) + df + df.dtypes + +*Previous Behavior*: + +.. code-block:: python + + In [3]: np.sqrt(df).dtypes + Out[3]: + A float64 + dtype: object + +*New Behavior*: + +.. ipython:: python + + np.sqrt(df).dtypes + +.. _whatsnew_0250.api_breaking.groupby_categorical: + +Categorical dtypes are preserved during groupby +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`) + +.. ipython:: python + + df = pd.DataFrame({'payload': [-1,-2,-1,-2], + 'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)}) + df + df.dtypes + +*Previous Behavior*: + +.. code-block:: python + + In [5]: df.groupby('payload').first().col.dtype + Out[5]: dtype('O') + +*New Behavior*: + +.. ipython:: python + + df.groupby('payload').first().col.dtype + + .. _whatsnew_0250.api_breaking.incompatible_index_unions: Incompatible Index Type Unions diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2f66e9ed46aa0..0268f8fbdf467 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -605,7 +605,7 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy=True, skipna=False): +def astype_nansafe(arr, dtype, copy=True, skipna=False, casting='unsafe'): """ Cast the elements of an array to a given dtype a nan-safe manner. @@ -616,8 +616,10 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): copy : bool, default True If False, a view will be attempted but may fail, if e.g. the item sizes don't align. - skipna: bool, default False + skipna : bool, default False Whether or not we should skip NaN when casting as a string-type. + casting : {‘no’, ‘equiv’, ‘safe’, ‘same_kind’, ‘unsafe’} + optional, default 'unsafe' Raises ------ @@ -703,7 +705,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): if copy or is_object_dtype(arr) or is_object_dtype(dtype): # Explicit copy, or required since NumPy can't view from / to object. - return arr.astype(dtype, copy=True) + return arr.astype(dtype, copy=True, casting=casting) return arr.view(dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6746844f4b1fa..ad5a04f8cb934 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2641,6 +2641,51 @@ def transpose(self, *args, **kwargs): T = property(transpose) + # ---------------------------------------------------------------------- + # Array Interface + + # This is also set in IndexOpsMixin + # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented + __array_priority__ = 1000 + + def __array__(self, dtype=None): + return com.values_from_object(self) + + def __array_wrap__(self, result: np.ndarray, context=None) -> 'DataFrame': + """ + We are called post ufunc; reconstruct the original object and dtypes. + + Parameters + ---------- + result : np.ndarray + context + + Returns + ------- + DataFrame + """ + + d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) + result = self._constructor(result, **d) + + # we try to cast extension array types back to the original + # TODO: this fails with duplicates, ugh + if self._data.any_extension_types: + result = result.astype(self.dtypes, + copy=False, + errors='ignore', + casting='same_kind') + + return result.__finalize__(self) + + # ideally we would define this to avoid the getattr checks, but + # is slower + # @property + # def __array_interface__(self): + # """ provide numpy array interface method """ + # values = self.values + # return dict(typestr=values.dtype.str,shape=values.shape,data=values) + # ---------------------------------------------------------------------- # Picklability diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 360576ffdb00a..b614298bb912c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1919,25 +1919,6 @@ def empty(self): # ---------------------------------------------------------------------- # Array Interface - # This is also set in IndexOpsMixin - # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented - __array_priority__ = 1000 - - def __array__(self, dtype=None): - return com.values_from_object(self) - - def __array_wrap__(self, result, context=None): - d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) - return self._constructor(result, **d).__finalize__(self) - - # ideally we would define this to avoid the getattr checks, but - # is slower - # @property - # def __array_interface__(self): - # """ provide numpy array interface method """ - # values = self.values - # return dict(typestr=values.dtype.str,shape=values.shape,data=values) - def to_dense(self): """ Return dense representation of NDFrame (as opposed to sparse). @@ -5693,6 +5674,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): **kwargs) return self._constructor(new_data).__finalize__(self) + if not results: + if copy: + self = self.copy() + return self + # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) result.columns = self.columns diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ffa552913ae..20b7a595f49e9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -156,12 +156,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, obj = self.obj[data.items[locs]] s = groupby(obj, self.grouper) - result = s.aggregate(lambda x: alt(x, axis=self.axis)) + try: + result = s.aggregate(lambda x: alt(x, axis=self.axis)) + except Exception: + # we may have an exception in trying to aggregate + # continue and exclude the block + pass finally: + dtype = block.values.dtype + # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result) + result = block._try_coerce_and_cast_result(result, dtype=dtype) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 43950f2f503c8..2c042d55dfa4b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -786,6 +786,8 @@ def _try_cast(self, result, obj, numeric_only=False): elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. + + # return the same type (Series) as our caller try: result = obj._values._from_sequence(result, dtype=dtype) except Exception: @@ -1157,7 +1159,8 @@ def mean(self, *args, **kwargs): """ nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) try: - return self._cython_agg_general('mean', **kwargs) + return self._cython_agg_general( + 'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1179,7 +1182,11 @@ def median(self, **kwargs): Median of values within each group. """ try: - return self._cython_agg_general('median', **kwargs) + return self._cython_agg_general( + 'median', + alt=lambda x, + axis: Series(x).median(**kwargs), + **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1235,7 +1242,10 @@ def var(self, ddof=1, *args, **kwargs): nv.validate_groupby_func('var', args, kwargs) if ddof == 1: try: - return self._cython_agg_general('var', **kwargs) + return self._cython_agg_general( + 'var', + alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), + **kwargs) except Exception: f = lambda x: x.var(ddof=ddof, **kwargs) with _group_selection_context(self): @@ -1263,7 +1273,6 @@ def sem(self, ddof=1): Series or DataFrame Standard error of the mean of values within each group. """ - return self.std(ddof=ddof) / np.sqrt(self.count()) @Substitution(name='groupby') @@ -1320,6 +1329,16 @@ def f(self, **kwargs): except Exception: result = self.aggregate( lambda x: npfunc(x, axis=self.axis)) + + # coerce the columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast( + result[col], self.obj[col]) + else: + result = self._try_cast( + result, self.obj) + if _convert: result = result._convert(datetime=True) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 010047a8be4ed..38478be5a8e07 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -19,7 +19,7 @@ from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, ensure_int_or_float, ensure_object, ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, + is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, is_timedelta64_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import _maybe_fill, isna @@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values): + if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( - "categoricals are not support in cython ops ATM") + "{} are not support in cython ops".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4cc6c86417b3b..429b2b064c702 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -600,7 +600,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = astype_nansafe(values.ravel(), dtype, copy=True) + values = astype_nansafe( + values.ravel(), dtype, copy=True, **kwargs) # TODO(extension) # should we make this attribute? @@ -1767,6 +1768,19 @@ def _slice(self, slicer): return self.values[slicer] + def _try_cast_result(self, result, dtype=None): + """ + if we have an operation that operates on for example floats + we want to try to cast back to our EA here if possible + """ + try: + result = self._holder._from_sequence( + np.asarray(result).ravel(), dtype=dtype) + except Exception: + pass + + return result + def formatting_values(self): # Deprecating the ability to override _formatting_values. # Do the warning here, it's only user in pandas, since we diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7923e463c7719..24a28bf0005cb 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -72,11 +72,12 @@ def _f(*args, **kwargs): class bottleneck_switch: - def __init__(self, **kwargs): + def __init__(self, name=None, **kwargs): + self.name = name self.kwargs = kwargs def __call__(self, alt): - bn_name = alt.__name__ + bn_name = self.name or alt.__name__ try: bn_func = getattr(bn, bn_name) @@ -804,7 +805,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): - @bottleneck_switch() + + @bottleneck_switch(name='nan' + meth) def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max, fill_value = _get_values( @@ -824,7 +826,6 @@ def reduction(values, axis=None, skipna=True, mask=None): result = _wrap_results(result, dtype, fill_value) return _maybe_null_out(result, axis, mask, values.shape) - reduction.__name__ = 'nan' + meth return reduction diff --git a/pandas/core/series.py b/pandas/core/series.py index c4a449154860f..485c7db92bb2d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -762,12 +762,31 @@ def __array__(self, dtype=None): dtype = 'M8[ns]' return np.asarray(self.array, dtype) - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result: np.ndarray, context=None) -> 'Series': """ - Gets called after a ufunc. + We are called post ufunc; reconstruct the original object and dtypes. + + Parameters + ---------- + result : np.ndarray + context + + Returns + ------- + Series """ - return self._constructor(result, index=self.index, - copy=False).__finalize__(self) + + result = self._constructor(result, index=self.index, + copy=False) + + # we try to cast extension array types back to the original + if is_extension_array_dtype(self): + result = result.astype(self.dtype, + copy=False, + errors='ignore', + casting='same_kind') + + return result.__finalize__(self) def __array_prepare__(self, result, context=None): """ diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3d9bfcd126377..379b6db2b650e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -144,6 +144,7 @@ def test_arg_passthru(): index=Index([1, 2], name='group'), columns=['int', 'float', 'category_int', 'datetime', 'datetimetz', 'timedelta']) + for attr in ['mean', 'median']: f = getattr(df.groupby('group'), attr) result = f() @@ -459,35 +460,35 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) -def test_ops_general(): - ops = [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), ] +def scipy_sem(*args, **kwargs): try: from scipy.stats import sem + return sem(*args, ddof=1, **kwargs) except ImportError: - pass - else: - ops.append(('sem', sem)) + pytest.skip("No Scipy installed") + + +@pytest.mark.parametrize( + 'op,targop', + [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + ('sem', scipy_sem)]) +def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - for op, targop in ops: - result = getattr(df.groupby(labels), op)().astype(float) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) def test_max_nan_bug(): diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 6a08a8d79b63e..b174fb0e0b6f9 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -282,18 +282,21 @@ def test_first_last_tz(data, expected_first, expected_last): ]) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 + category_string = pd.Series(list('abc')).astype( + 'category') df = pd.DataFrame({'group': [1, 1, 2], - 'category_string': pd.Series(list('abc')).astype( - 'category'), + 'category_string': category_string, 'datetimetz': pd.date_range('20130101', periods=3, tz='US/Eastern')}) result = getattr(df.groupby('group'), method)() - expepcted = pd.DataFrame({'category_string': [alpha, 'c'], - 'datetimetz': [ts, - Timestamp('2013-01-03', - tz='US/Eastern')]}, - index=pd.Index([1, 2], name='group')) - assert_frame_equal(result, expepcted) + expected = pd.DataFrame( + {'category_string': pd.Categorical( + [alpha, 'c'], dtype=category_string.dtype), + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expected) def test_nth_multi_index_as_expected(): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5711174ef0c9f..830ba6062cc72 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -112,6 +112,12 @@ def test_resample_integerarray(): dtype="Int64") assert_series_equal(result, expected) + result = ts.resample('3T').mean() + expected = Series([1, 4, 7], + index=pd.date_range('1/1/2000', periods=3, freq='3T'), + dtype='Int64') + assert_series_equal(result, expected) + def test_resample_basic_grouper(series): s = series diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py index ae97682f297ad..7054f9a9dd65f 100644 --- a/pandas/tests/sparse/frame/test_analytics.py +++ b/pandas/tests/sparse/frame/test_analytics.py @@ -1,7 +1,8 @@ import numpy as np import pytest -from pandas import DataFrame, SparseDataFrame, SparseSeries +from pandas import ( + DataFrame, Series, SparseDataFrame, SparseDtype, SparseSeries) from pandas.util import testing as tm @@ -39,3 +40,16 @@ def test_quantile_multi(): tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) + + +@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=lambda x: x.__name__) +def test_ufunc(func): + # GH 23743 + # assert we preserve the incoming dtype on ufunc operation + df = DataFrame( + {'A': Series([1, np.nan, 3], dtype=SparseDtype('float64', np.nan))}) + result = func(df) + expected = DataFrame( + {'A': Series(func([1, np.nan, 3]), + dtype=SparseDtype('float64', np.nan))}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/sparse/series/test_analytics.py b/pandas/tests/sparse/series/test_analytics.py new file mode 100644 index 0000000000000..fe2eaf0e4de4c --- /dev/null +++ b/pandas/tests/sparse/series/test_analytics.py @@ -0,0 +1,16 @@ +import numpy as np +import pytest + +from pandas import Series, SparseDtype +from pandas.util import testing as tm + + +@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=lambda x: x.__name__) +def test_ufunc(func): + # GH 23743 + # assert we preserve the incoming dtype on ufunc operation + s = Series([1, np.nan, 3], dtype=SparseDtype('float64', np.nan)) + result = func(s) + expected = Series(func([1, np.nan, 3]), + dtype=SparseDtype('float64', np.nan)) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index 531a4360c78a2..bf6055bc12725 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -29,11 +29,10 @@ def test_first_last_nth(self): sparse_grouped_last = sparse_grouped.last() sparse_grouped_nth = sparse_grouped.nth(1) - dense_grouped_first = dense_grouped.first().to_sparse() - dense_grouped_last = dense_grouped.last().to_sparse() - dense_grouped_nth = dense_grouped.nth(1).to_sparse() + dense_grouped_first = pd.DataFrame(dense_grouped.first().to_sparse()) + dense_grouped_last = pd.DataFrame(dense_grouped.last().to_sparse()) + dense_grouped_nth = pd.DataFrame(dense_grouped.nth(1).to_sparse()) - # TODO: shouldn't these all be spares or not? tm.assert_frame_equal(sparse_grouped_first, dense_grouped_first) tm.assert_frame_equal(sparse_grouped_last, @@ -69,5 +68,6 @@ def test_groupby_includes_fill_value(fill_value): 'b': [fill_value, 1, fill_value, fill_value]}) sdf = df.to_sparse(fill_value=fill_value) result = sdf.groupby('a').sum() - expected = df.groupby('a').sum().to_sparse(fill_value=fill_value) + expected = pd.DataFrame(df.groupby('a').sum().to_sparse( + fill_value=fill_value)) tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 114e7b4bacd94..5c070ba5a9a5b 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -47,10 +47,20 @@ def test_pivot_table(self): # values='E', aggfunc='sum') # tm.assert_frame_equal(res_sparse, res_dense) - def test_pivot_table_multi(self): + @pytest.mark.parametrize( + 'func', + ['mean', + 'std', + 'var', + 'sem', + 'median', + 'first', + 'last']) + def test_pivot_table_multi(self, func): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values=['D', 'E']) + values=['D', 'E'], aggfunc=func) res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values=['D', 'E']) + values=['D', 'E'], aggfunc=func) res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]")) tm.assert_frame_equal(res_sparse, res_dense) From d1490a2ef5ad22aeb72a6bb31ca16491bb2b96d6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 28 May 2019 22:09:08 -0400 Subject: [PATCH 02/10] lint --- doc/source/whatsnew/v0.25.0.rst | 9 ++++++--- pandas/core/groupby/groupby.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6a792500029e0..5c21f446b84a7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -328,7 +328,9 @@ resulting dtypes to same as the input dtype; previously this would coerce to a d .. ipython:: python - df = pd.DataFrame({'A': pd.Series([1, np.nan, 3], dtype=pd.SparseDtype('float64', np.nan))}) + df = pd.DataFrame( + {'A': pd.Series([1, np.nan, 3], + dtype=pd.SparseDtype('float64', np.nan))}) df df.dtypes @@ -356,8 +358,9 @@ Previously, columns that were categorical, but not the groupby key(s) would be c .. ipython:: python - df = pd.DataFrame({'payload': [-1,-2,-1,-2], - 'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)}) + df = pd.DataFrame( + {'payload': [-1, -2, -1, -2], + 'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)}) df df.dtypes diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2c042d55dfa4b..6274f45427db6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1185,7 +1185,7 @@ def median(self, **kwargs): return self._cython_agg_general( 'median', alt=lambda x, - axis: Series(x).median(**kwargs), + axis: Series(x).median(axis=axis, **kwargs), **kwargs) except GroupByError: raise From 561e960fab5dddcbf86e5b34fa434ad778d0e983 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 29 May 2019 20:54:00 -0400 Subject: [PATCH 03/10] review comments --- pandas/core/frame.py | 7 ++++--- pandas/core/series.py | 7 ++++--- pandas/tests/groupby/test_function.py | 11 ++++------- pandas/tests/sparse/frame/test_analytics.py | 2 +- pandas/tests/sparse/series/test_analytics.py | 2 +- pandas/tests/sparse/test_pivot.py | 13 ++++++++----- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ad5a04f8cb934..530590ea5dc45 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,7 +16,7 @@ import sys import warnings from textwrap import dedent -from typing import FrozenSet, List, Optional, Set, Type, Union +from typing import FrozenSet, List, Optional, Tuple, Set, Type, Union import numpy as np import numpy.ma as ma @@ -2651,14 +2651,15 @@ def transpose(self, *args, **kwargs): def __array__(self, dtype=None): return com.values_from_object(self) - def __array_wrap__(self, result: np.ndarray, context=None) -> 'DataFrame': + def __array_wrap__(self, result: np.ndarray, + context: Optional[Tuple] = None) -> 'DataFrame': """ We are called post ufunc; reconstruct the original object and dtypes. Parameters ---------- result : np.ndarray - context + context : tuple, optional Returns ------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 485c7db92bb2d..a15343f2806ba 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -6,6 +6,7 @@ from shutil import get_terminal_size from textwrap import dedent import warnings +from typing import Optional, Tuple import numpy as np @@ -762,20 +763,20 @@ def __array__(self, dtype=None): dtype = 'M8[ns]' return np.asarray(self.array, dtype) - def __array_wrap__(self, result: np.ndarray, context=None) -> 'Series': + def __array_wrap__(self, result: np.ndarray, + context: Optional[Tuple] = None) -> 'Series': """ We are called post ufunc; reconstruct the original object and dtypes. Parameters ---------- result : np.ndarray - context + context : tuple, optional Returns ------- Series """ - result = self._constructor(result, index=self.index, copy=False) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 379b6db2b650e..0ede4cab269f2 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -12,7 +12,7 @@ from pandas import ( DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna) import pandas.core.nanops as nanops -from pandas.util import testing as tm +from pandas.util import _test_decorators as td, testing as tm @pytest.mark.parametrize("agg_func", ['any', 'all']) @@ -461,11 +461,8 @@ def test_groupby_cumprod(): def scipy_sem(*args, **kwargs): - try: - from scipy.stats import sem - return sem(*args, ddof=1, **kwargs) - except ImportError: - pytest.skip("No Scipy installed") + from scipy.stats import sem + return sem(*args, ddof=1, **kwargs) @pytest.mark.parametrize( @@ -481,7 +478,7 @@ def scipy_sem(*args, **kwargs): ('first', lambda x: x.iloc[0]), ('last', lambda x: x.iloc[-1]), ('count', np.size), - ('sem', scipy_sem)]) + pytest.param('sem', scipy_sem, mark=td._skip_if_no_scipy)]) def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py index 7054f9a9dd65f..b187976703447 100644 --- a/pandas/tests/sparse/frame/test_analytics.py +++ b/pandas/tests/sparse/frame/test_analytics.py @@ -42,7 +42,7 @@ def test_quantile_multi(): tm.assert_sp_frame_equal(result, sparse_expected) -@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=lambda x: x.__name__) +@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=str) def test_ufunc(func): # GH 23743 # assert we preserve the incoming dtype on ufunc operation diff --git a/pandas/tests/sparse/series/test_analytics.py b/pandas/tests/sparse/series/test_analytics.py index fe2eaf0e4de4c..bac778f09fe66 100644 --- a/pandas/tests/sparse/series/test_analytics.py +++ b/pandas/tests/sparse/series/test_analytics.py @@ -5,7 +5,7 @@ from pandas.util import testing as tm -@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=lambda x: x.__name__) +@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=str) def test_ufunc(func): # GH 23743 # assert we preserve the incoming dtype on ufunc operation diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 5c070ba5a9a5b..fd29c918a7c93 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -56,11 +56,14 @@ def test_pivot_table(self): 'median', 'first', 'last']) - def test_pivot_table_multi(self, func): + @pytest.mark.parametrize('dropna', [True, False]) + def test_pivot_table_multi(self, func, dropna): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values=['D', 'E'], aggfunc=func) - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values=['D', 'E'], aggfunc=func) + res_sparse = pd.pivot_table( + self.sparse, index='A', columns='B', + values=['D', 'E'], aggfunc=func, dropna=dropna) + res_dense = pd.pivot_table( + self.dense, index='A', columns='B', + values=['D', 'E'], aggfunc=func, dropna=dropna) res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]")) tm.assert_frame_equal(res_sparse, res_dense) From 28be4d9f21493c58dbe1d12b83166b327d20038b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 29 May 2019 21:20:53 -0400 Subject: [PATCH 04/10] use marks --- pandas/tests/groupby/test_function.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0ede4cab269f2..2a77b1b96a662 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -478,7 +478,9 @@ def scipy_sem(*args, **kwargs): ('first', lambda x: x.iloc[0]), ('last', lambda x: x.iloc[-1]), ('count', np.size), - pytest.param('sem', scipy_sem, mark=td._skip_if_no_scipy)]) + pytest.param( + 'sem', scipy_sem, marks=[pytest.mark.skipif( + td._skip_if_no_scipy(), reason='scipy not installed')])]) def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) From d6db2ea2eb67a7a831727bb1be5e093c1abc80ff Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 2 Jun 2019 16:00:36 -0400 Subject: [PATCH 05/10] review comments --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/series.py | 2 +- pandas/tests/sparse/test_pivot.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5c21f446b84a7..6dacb9dae834a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -323,7 +323,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t ufuncs on Extension Dtype ^^^^^^^^^^^^^^^^^^^^^^^^^ -Operations with ``numpy`` ufuncs on Extension Arrays, including Sparse Dtypes will now coerce the +Operations with ``numpy`` ufuncs on Extension Arrays, including Sparse Dtypes will now preserve the resulting dtypes to same as the input dtype; previously this would coerce to a dense dtype. (:issue:`23743`) .. ipython:: python diff --git a/pandas/core/series.py b/pandas/core/series.py index a15343f2806ba..59ae60cfb38d7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,8 +5,8 @@ from io import StringIO from shutil import get_terminal_size from textwrap import dedent -import warnings from typing import Optional, Tuple +import warnings import numpy as np diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index fd29c918a7c93..f545ce0310a2a 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -2,6 +2,7 @@ import pytest import pandas as pd +from pandas import _np_version_under1p17 import pandas.util.testing as tm @@ -53,7 +54,8 @@ def test_pivot_table(self): 'std', 'var', 'sem', - 'median', + pytest.param('median', marks=pytest.mark.xfail( + not _np_version_under1p17, reason="fails on numpy > 1.16")), 'first', 'last']) @pytest.mark.parametrize('dropna', [True, False]) From 7c29393a4545010cc0fc069c70b5c040bea0e23f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 2 Jun 2019 17:04:33 -0400 Subject: [PATCH 06/10] allow coercing casting --- pandas/core/arrays/sparse.py | 14 +++++++++++++- pandas/core/series.py | 5 +---- pandas/tests/sparse/frame/test_analytics.py | 12 ++++++++---- pandas/tests/sparse/series/test_analytics.py | 12 ++++++++---- 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 3dda6868a80da..dadbd5e23dce9 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1926,8 +1926,20 @@ def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): index = _make_index(length, indices, kind) sparsified_values = arr[mask] + + # careful about casting here + # as we could easily specify a type that cannot hold the resulting values + # e.g. integer when we have floats if dtype is not None: - sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) + try: + sparsified_values = astype_nansafe( + sparsified_values, dtype=dtype, casting='same_kind') + except TypeError: + dtype = 'float64' + sparsified_values = astype_nansafe( + sparsified_values, dtype=dtype, casting='unsafe') + + # TODO: copy return sparsified_values, index, fill_value diff --git a/pandas/core/series.py b/pandas/core/series.py index 59ae60cfb38d7..2432d801fe07e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -782,10 +782,7 @@ def __array_wrap__(self, result: np.ndarray, # we try to cast extension array types back to the original if is_extension_array_dtype(self): - result = result.astype(self.dtype, - copy=False, - errors='ignore', - casting='same_kind') + result = result.astype(self.dtype, copy=False) return result.__finalize__(self) diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py index b187976703447..a6d2225377dc3 100644 --- a/pandas/tests/sparse/frame/test_analytics.py +++ b/pandas/tests/sparse/frame/test_analytics.py @@ -42,14 +42,18 @@ def test_quantile_multi(): tm.assert_sp_frame_equal(result, sparse_expected) +@pytest.mark.parametrize( + 'data, dtype', + [([1, np.nan, 3], SparseDtype('float64', np.nan)), + ([1, 2, 3], SparseDtype('int'))]) @pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=str) -def test_ufunc(func): +def test_ufunc(data, dtype, func): # GH 23743 # assert we preserve the incoming dtype on ufunc operation df = DataFrame( - {'A': Series([1, np.nan, 3], dtype=SparseDtype('float64', np.nan))}) + {'A': Series(data, dtype=dtype)}) result = func(df) expected = DataFrame( - {'A': Series(func([1, np.nan, 3]), - dtype=SparseDtype('float64', np.nan))}) + {'A': Series(func(data), + dtype=dtype)}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/sparse/series/test_analytics.py b/pandas/tests/sparse/series/test_analytics.py index bac778f09fe66..97a86a7dec94d 100644 --- a/pandas/tests/sparse/series/test_analytics.py +++ b/pandas/tests/sparse/series/test_analytics.py @@ -5,12 +5,16 @@ from pandas.util import testing as tm +@pytest.mark.parametrize( + 'data, dtype', + [([1, np.nan, 3], SparseDtype('float64', np.nan)), + ([1, 2, 3], SparseDtype('int'))]) @pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=str) -def test_ufunc(func): +def test_ufunc(data, dtype, func): # GH 23743 # assert we preserve the incoming dtype on ufunc operation - s = Series([1, np.nan, 3], dtype=SparseDtype('float64', np.nan)) + s = Series(data, dtype=dtype) result = func(s) - expected = Series(func([1, np.nan, 3]), - dtype=SparseDtype('float64', np.nan)) + expected = Series(func(data), + dtype=dtype) tm.assert_series_equal(result, expected) From 0662f2b1007cafb9af5837fa61198ef7bd4631a1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 2 Jun 2019 19:18:23 -0400 Subject: [PATCH 07/10] infer types --- pandas/core/arrays/sparse.py | 19 +++++++++---------- pandas/core/internals/construction.py | 5 ++++- pandas/core/sparse/frame.py | 14 ++++++++++---- pandas/tests/sparse/frame/test_analytics.py | 2 +- pandas/tests/sparse/series/test_analytics.py | 2 +- 5 files changed, 25 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index dadbd5e23dce9..68fd58e63ba02 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1927,18 +1927,17 @@ def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): index = _make_index(length, indices, kind) sparsified_values = arr[mask] - # careful about casting here - # as we could easily specify a type that cannot hold the resulting values - # e.g. integer when we have floats + # careful about casting here as we could easily specify a type that + # cannot hold the resulting values, e.g. integer when we have floats + # if we don't have an object specified then use this as the cast if dtype is not None: - try: - sparsified_values = astype_nansafe( - sparsified_values, dtype=dtype, casting='same_kind') - except TypeError: - dtype = 'float64' - sparsified_values = astype_nansafe( - sparsified_values, dtype=dtype, casting='unsafe') + ok_to_cast = all(not (is_object_dtype(t) or is_bool_dtype(t)) + for t in (dtype, sparsified_values.dtype)) + if ok_to_cast: + dtype = find_common_type([dtype, sparsified_values.dtype]) + sparsified_values = astype_nansafe( + sparsified_values, dtype=dtype) # TODO: copy return sparsified_values, index, fill_value diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2616f0aa97d0d..8e1609c1364fd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -666,7 +666,10 @@ def sanitize_array(data, index, dtype=None, copy=False, data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if is_object_dtype(subarr.dtype) and dtype != 'object': + if (not (is_extension_array_dtype(subarr.dtype) or + is_extension_array_dtype(dtype)) and + is_object_dtype(subarr.dtype) and + not is_object_dtype(dtype)): inferred = lib.infer_dtype(subarr, skipna=False) if inferred == 'period': try: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 67ecbcbea67f9..778fff249817d 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -284,20 +284,26 @@ def _unpickle_sparse_frame_compat(self, state): def to_dense(self): return SparseFrameAccessor(self).to_dense() - def _apply_columns(self, func): + def _apply_columns(self, func, *args, **kwargs): """ Get new SparseDataFrame applying func to each columns """ - new_data = {col: func(series) + new_data = {col: func(series, *args, **kwargs) for col, series in self.items()} return self._constructor( data=new_data, index=self.index, columns=self.columns, default_fill_value=self.default_fill_value).__finalize__(self) - def astype(self, dtype): - return self._apply_columns(lambda x: x.astype(dtype)) + def astype(self, dtype, **kwargs): + + def f(x, dtype, **kwargs): + if isinstance(dtype, (dict, Series)): + dtype = dtype[x.name] + return x.astype(dtype, **kwargs) + + return self._apply_columns(f, dtype=dtype, **kwargs) def copy(self, deep=True): """ diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py index a6d2225377dc3..52fcf7c355cf2 100644 --- a/pandas/tests/sparse/frame/test_analytics.py +++ b/pandas/tests/sparse/frame/test_analytics.py @@ -55,5 +55,5 @@ def test_ufunc(data, dtype, func): result = func(df) expected = DataFrame( {'A': Series(func(data), - dtype=dtype)}) + dtype=SparseDtype('float64', dtype.fill_value))}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/sparse/series/test_analytics.py b/pandas/tests/sparse/series/test_analytics.py index 97a86a7dec94d..bf04f5b52a371 100644 --- a/pandas/tests/sparse/series/test_analytics.py +++ b/pandas/tests/sparse/series/test_analytics.py @@ -16,5 +16,5 @@ def test_ufunc(data, dtype, func): s = Series(data, dtype=dtype) result = func(s) expected = Series(func(data), - dtype=dtype) + dtype=SparseDtype('float64', dtype.fill_value)) tm.assert_series_equal(result, expected) From c75461c191a1e3d90d3af200c50189dffb3c6fa4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 8 Jun 2019 19:31:15 -0400 Subject: [PATCH 08/10] sparse masking --- pandas/tests/extension/test_sparse.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index faf1905ea1763..b259318371c01 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -154,6 +154,32 @@ def test_reindex(self, data, na_value): self._check_unsupported(data) super().test_reindex(data, na_value) + def test_getitem_mask(self, data): + # Empty mask, raw array + mask = np.zeros(len(data), dtype=bool) + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + # Empty mask, in series + mask = np.zeros(len(data), dtype=bool) + result = pd.Series(data)[mask] + assert len(result) == 0 + + # we change int -> float because of the masking + assert result.dtype == SparseDtype('float64', data.dtype.fill_value) + + # non-empty mask, raw array + mask[0] = True + result = data[mask] + assert len(result) == 1 + assert isinstance(result, type(data)) + + # non-empty mask, in series + result = pd.Series(data)[mask] + assert len(result) == 1 + assert result.dtype == data.dtype + # Skipping TestSetitem, since we don't implement it. From 86090bf6b5e02fd55e063aaf39d44e53354ab284 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 9 Jun 2019 19:12:28 -0400 Subject: [PATCH 09/10] fix float casting --- pandas/core/arrays/sparse.py | 24 +++++++++++++++------- pandas/tests/arrays/sparse/test_array.py | 1 + pandas/tests/extension/test_sparse.py | 26 ------------------------ 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 68fd58e63ba02..490df5b250f74 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -25,7 +25,8 @@ infer_dtype_from_scalar) from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, - is_integer, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype) + is_float_dtype, is_integer, is_integer_dtype, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ( ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries) @@ -1927,15 +1928,24 @@ def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): index = _make_index(length, indices, kind) sparsified_values = arr[mask] - # careful about casting here as we could easily specify a type that - # cannot hold the resulting values, e.g. integer when we have floats - # if we don't have an object specified then use this as the cast if dtype is not None: - ok_to_cast = all(not (is_object_dtype(t) or is_bool_dtype(t)) - for t in (dtype, sparsified_values.dtype)) - if ok_to_cast: + # careful about casting here as we could easily specify a type that + # cannot hold the resulting values, e.g. integer when we have floats + # if this is not safe then convert the dtype; note that if there are + # nan's in the source array this will raise + + # TODO: ideally this would be done by 'safe' casting in astype_nansafe + # but alas too many cases rely upon this working in the current way + # and casting='safe' doesn't really work in numpy properly + if is_integer_dtype(dtype) and is_float_dtype(sparsified_values.dtype): + result = astype_nansafe( + sparsified_values, dtype=dtype) + if np.allclose(result, sparsified_values, rtol=0): + return result, index, fill_value + dtype = find_common_type([dtype, sparsified_values.dtype]) + sparsified_values = astype_nansafe( sparsified_values, dtype=dtype) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 231b5a92dbb3a..69259c66d61dd 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -472,6 +472,7 @@ def test_astype(self): # float -> float arr = SparseArray([None, None, 0, 2]) result = arr.astype("Sparse[float32]") + expected = SparseArray([None, None, 0, 2], dtype=np.dtype('float32')) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index b259318371c01..faf1905ea1763 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -154,32 +154,6 @@ def test_reindex(self, data, na_value): self._check_unsupported(data) super().test_reindex(data, na_value) - def test_getitem_mask(self, data): - # Empty mask, raw array - mask = np.zeros(len(data), dtype=bool) - result = data[mask] - assert len(result) == 0 - assert isinstance(result, type(data)) - - # Empty mask, in series - mask = np.zeros(len(data), dtype=bool) - result = pd.Series(data)[mask] - assert len(result) == 0 - - # we change int -> float because of the masking - assert result.dtype == SparseDtype('float64', data.dtype.fill_value) - - # non-empty mask, raw array - mask[0] = True - result = data[mask] - assert len(result) == 1 - assert isinstance(result, type(data)) - - # non-empty mask, in series - result = pd.Series(data)[mask] - assert len(result) == 1 - assert result.dtype == data.dtype - # Skipping TestSetitem, since we don't implement it. From 4bd486eb718d416a47aac1deb7907012e64da860 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 9 Jun 2019 20:36:50 -0400 Subject: [PATCH 10/10] review comments --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/groupby/groupby.py | 35 +++++++++++++++------------ pandas/core/internals/blocks.py | 8 ++++++ pandas/tests/groupby/test_function.py | 3 +-- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6dacb9dae834a..2e4959b44aeeb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -323,7 +323,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t ufuncs on Extension Dtype ^^^^^^^^^^^^^^^^^^^^^^^^^ -Operations with ``numpy`` ufuncs on Extension Arrays, including Sparse Dtypes will now preserve the +Operations with ``numpy`` ufuncs on DataFrames with Extension Arrays, including Sparse Dtypes will now preserve the resulting dtypes to same as the input dtype; previously this would coerce to a dense dtype. (:issue:`23743`) .. ipython:: python diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6274f45427db6..e067185e7ce94 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1299,7 +1299,7 @@ def _add_numeric_operations(cls): """ def groupby_function(name, alias, npfunc, - numeric_only=True, _convert=False, + numeric_only=True, min_count=-1): _local_template = """ @@ -1321,27 +1321,30 @@ def f(self, **kwargs): kwargs['min_count'] = min_count self._set_group_selection() + + # try a cython aggregation if we can try: return self._cython_agg_general( alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: - result = self.aggregate( - lambda x: npfunc(x, axis=self.axis)) - - # coerce the columns if we can - if isinstance(result, DataFrame): - for col in result.columns: - result[col] = self._try_cast( - result[col], self.obj[col]) - else: - result = self._try_cast( - result, self.obj) - - if _convert: - result = result._convert(datetime=True) - return result + pass + + # apply a non-cython aggregation + result = self.aggregate( + lambda x: npfunc(x, axis=self.axis)) + + # coerce the resulting columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast( + result[col], self.obj[col]) + else: + result = self._try_cast( + result, self.obj) + + return result set_function_name(f, name, cls) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 429b2b064c702..dfb5c458b0d77 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1772,8 +1772,16 @@ def _try_cast_result(self, result, dtype=None): """ if we have an operation that operates on for example floats we want to try to cast back to our EA here if possible + + result could be a 2-D numpy array, e.g. the result of + a numeric operation; but it must be shape (1, X) because + we by-definition operate on the ExtensionBlocks one-by-one + + result could also be an EA Array itself, in which case it + is already a 1-D array """ try: + result = self._holder._from_sequence( np.asarray(result).ravel(), dtype=dtype) except Exception: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 2a77b1b96a662..355da1151d878 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -479,8 +479,7 @@ def scipy_sem(*args, **kwargs): ('last', lambda x: x.iloc[-1]), ('count', np.size), pytest.param( - 'sem', scipy_sem, marks=[pytest.mark.skipif( - td._skip_if_no_scipy(), reason='scipy not installed')])]) + 'sem', scipy_sem, marks=td.skip_if_no_scipy)]) def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float)