-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
WIP: implement reductions for DatetimeArray/TimedeltaArray/PeriodArray #23890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3c3c156
e8a5e8b
167989c
febaf67
e3fce02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds | ||
from pandas._libs.tslibs.timestamps import maybe_integer_op_deprecated | ||
import pandas.compat as compat | ||
from pandas.compat.numpy import function as nv | ||
from pandas.errors import ( | ||
AbstractMethodError, NullFrequencyError, PerformanceWarning) | ||
from pandas.util._decorators import deprecate_kwarg | ||
|
@@ -27,13 +28,55 @@ | |
|
||
from pandas.core.algorithms import checked_add_with_arr, take, unique1d | ||
import pandas.core.common as com | ||
from pandas.core.nanops import nanstd | ||
|
||
from pandas.tseries import frequencies | ||
from pandas.tseries.offsets import DateOffset, Tick | ||
|
||
from .base import ExtensionOpsMixin | ||
|
||
|
||
def _get_reduction_vals(obj, skipna): | ||
if not len(obj): | ||
return NaT | ||
|
||
if obj.hasnans: | ||
if not skipna: | ||
return NaT | ||
vals = obj.asi8[~obj._isnan] | ||
else: | ||
vals = obj.asi8 | ||
return vals | ||
|
||
|
||
def _make_reduction(op, diff=False, only_timedelta=False): | ||
""" | ||
Make a unary reduction method that handles NaT appropriately. | ||
""" | ||
|
||
def method(self, skipna=True, **kwargs): | ||
if only_timedelta: | ||
raise TypeError('"{meth}" reduction is not valid for {cls}' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. woops, that should be |
||
.format(meth=op.__name__, cls=type(self).__name__)) | ||
|
||
vals = _get_reduction_vals(self, skipna) | ||
if vals is NaT: | ||
return NaT | ||
|
||
# Try to minimize floating point error by rounding before casting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does DatetimeIndex do this casting? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only reductions DatetimeIndex has is min and max, for which it is not relevant. |
||
# to int64 | ||
result = op(vals, **kwargs) | ||
result = np.float64(result).round() | ||
result = np.int64(result) | ||
if diff: | ||
return self._box_func(result) - self._box_func(0) | ||
return self._box_func(result) | ||
|
||
method.__name__ = op.__name__ | ||
# TODO: __doc__ | ||
return method | ||
|
||
|
||
def _make_comparison_op(cls, op): | ||
# TODO: share code with indexes.base version? Main difference is that | ||
# the block for MultiIndex was removed here. | ||
|
@@ -364,6 +407,19 @@ def _validate_frequency(cls, index, freq, **kwargs): | |
'does not conform to passed frequency {passed}' | ||
.format(infer=inferred, passed=freq.freqstr)) | ||
|
||
# ---------------------------------------------------------------- | ||
# Reductions | ||
|
||
min = _make_reduction(np.min) | ||
max = _make_reduction(np.max) | ||
|
||
mean = _make_reduction(np.mean) | ||
median = _make_reduction(np.median) | ||
std = _make_reduction(nanstd, diff=True) | ||
|
||
sum = _make_reduction(np.sum, only_timedelta=True) | ||
# cumsum = _make_reduction(np.cumsum, only_timedelta=True) | ||
|
||
# ------------------------------------------------------------------ | ||
# Arithmetic Methods | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -795,7 +795,7 @@ def _ndarray_values(self): | |
def empty(self): | ||
return not self.size | ||
|
||
def max(self): | ||
def max(self, skipna=True, axis=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In Series, these are all |
||
""" | ||
Return the maximum value of the Index. | ||
|
||
|
@@ -826,19 +826,21 @@ def max(self): | |
>>> idx.max() | ||
('b', 2) | ||
""" | ||
nv.validate_minmax_axis(axis) | ||
return nanops.nanmax(self.values) | ||
|
||
def argmax(self, axis=None): | ||
def argmax(self, skipna=True, axis=None): | ||
""" | ||
Return a ndarray of the maximum argument indexer. | ||
|
||
See Also | ||
-------- | ||
numpy.ndarray.argmax | ||
""" | ||
nv.validate_minmax_axis(axis) | ||
return nanops.nanargmax(self.values) | ||
|
||
def min(self): | ||
def min(self, skipna=True, axis=None): | ||
""" | ||
Return the minimum value of the Index. | ||
|
||
|
@@ -869,16 +871,18 @@ def min(self): | |
>>> idx.min() | ||
('a', 1) | ||
""" | ||
nv.validate_minmax_axis(axis) | ||
return nanops.nanmin(self.values) | ||
|
||
def argmin(self, axis=None): | ||
def argmin(self, skipna=True, axis=None): | ||
""" | ||
Return a ndarray of the minimum argument indexer. | ||
|
||
See Also | ||
-------- | ||
numpy.ndarray.argmin | ||
""" | ||
nv.validate_minmax_axis(axis) | ||
return nanops.nanargmin(self.values) | ||
|
||
def tolist(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,8 +12,8 @@ | |
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask | ||
from pandas.core.dtypes.common import ( | ||
_get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype, | ||
is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float, | ||
is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype, | ||
is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, | ||
is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype, | ||
is_object_dtype, is_scalar, is_timedelta64_dtype) | ||
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna | ||
|
||
|
@@ -426,7 +426,6 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None): | |
return _wrap_results(the_sum, dtype) | ||
|
||
|
||
@disallow('M8') | ||
@bottleneck_switch() | ||
def nanmean(values, axis=None, skipna=True, mask=None): | ||
""" | ||
|
@@ -462,6 +461,14 @@ def nanmean(values, axis=None, skipna=True, mask=None): | |
elif is_float_dtype(dtype): | ||
dtype_sum = dtype | ||
dtype_count = dtype | ||
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yah, but I'm kind of hoping to get rid of that |
||
from pandas import DatetimeIndex | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does this need to be boxed in an index? Shouldn't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know this module especially well, but my assumption is that it could be a numpy array at this point. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that Ideally There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose the exception will be |
||
masked_vals = values | ||
if mask is not None: | ||
masked_vals = values[~mask] | ||
the_mean = DatetimeIndex(masked_vals).mean(skipna=skipna) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls try to follow the patterns in this module again i see lots of reinventing the wheel There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nothing in this PR has been updated since previous conversation. I’ll get around to this suggestion before too long. |
||
return the_mean | ||
|
||
count = _get_counts(mask, axis, dtype=dtype_count) | ||
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) | ||
|
||
|
@@ -476,7 +483,6 @@ def nanmean(values, axis=None, skipna=True, mask=None): | |
return _wrap_results(the_mean, dtype) | ||
|
||
|
||
@disallow('M8') | ||
@bottleneck_switch() | ||
def nanmedian(values, axis=None, skipna=True, mask=None): | ||
""" | ||
|
@@ -508,6 +514,14 @@ def get_median(x): | |
return np.nanmedian(x[mask]) | ||
|
||
values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask) | ||
|
||
if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): | ||
from pandas import DatetimeIndex | ||
masked_vals = values | ||
if mask is not None: | ||
masked_vals = values[~mask] | ||
return DatetimeIndex(masked_vals).median(skipna=skipna) | ||
|
||
if not is_float_dtype(values): | ||
values = values.astype('f8') | ||
values[mask] = np.nan | ||
|
@@ -561,7 +575,6 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): | |
return count, d | ||
|
||
|
||
@disallow('M8') | ||
@bottleneck_switch(ddof=1) | ||
def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): | ||
""" | ||
|
@@ -591,6 +604,14 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): | |
>>> nanops.nanstd(s) | ||
1.0 | ||
""" | ||
if is_datetime64_dtype(values) or is_datetime64tz_dtype(values): | ||
from pandas import DatetimeIndex | ||
masked_vals = values | ||
if mask is not None: | ||
masked_vals = values[~mask] | ||
return DatetimeIndex(masked_vals).std(skipna=skipna) | ||
# TODO: adjust by ddof? | ||
|
||
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, | ||
mask=mask)) | ||
return _wrap_results(result, values.dtype) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you seem to be reinventing the wheel here. we already do all of this in nanops.py for timedelta. I am not sure how this should be integrated here, but this is not the way. (meaning re-write all of the code we already have)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this is definitely a Proof Of Concept. For now the main question for you is if you're on board with the idea of bringing the Index reduction signatures in line with everything else
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes about the signatures generally. but maybe let's start with min/max as more straightforward? To do what you are suggestnig here will require using nanops (the interface can certainly be here), but the implementation is already there (in nanops).