Skip to content

Backport PR #56650 on branch 2.2.x (ENH: Implement dt methods for pyarrow duration types) #56656

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ Other enhancements
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`)
- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`)
Expand Down
87 changes: 87 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from pandas._libs import lib
from pandas._libs.tslibs import (
NaT,
Timedelta,
Timestamp,
timezones,
Expand Down Expand Up @@ -2498,6 +2499,92 @@ def _str_wrap(self, width: int, **kwargs):
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))

@property
def _dt_days(self):
    """Day component of each element, as an int32 pyarrow-backed array."""
    day_values = self._to_timedeltaarray().days
    converted = pa.array(day_values, from_pandas=True, type=pa.int32())
    return type(self)(converted)

@property
def _dt_hours(self):
    """Hour component of each element, as an int32 pyarrow-backed array."""
    hour_values = []
    for td in self._to_timedeltaarray():
        # NaT propagates as a null entry in the resulting pyarrow array.
        hour_values.append(None if td is NaT else td.components.hours)
    return type(self)(pa.array(hour_values, type=pa.int32()))

@property
def _dt_minutes(self):
    """Minute component of each element, as an int32 pyarrow-backed array."""
    minute_values = []
    for td in self._to_timedeltaarray():
        # NaT propagates as a null entry in the resulting pyarrow array.
        minute_values.append(None if td is NaT else td.components.minutes)
    return type(self)(pa.array(minute_values, type=pa.int32()))

@property
def _dt_seconds(self):
    """Second component of each element, as an int32 pyarrow-backed array."""
    second_values = self._to_timedeltaarray().seconds
    converted = pa.array(second_values, from_pandas=True, type=pa.int32())
    return type(self)(converted)

@property
def _dt_milliseconds(self):
    """Millisecond component of each element, as an int32 pyarrow-backed array."""
    ms_values = []
    for td in self._to_timedeltaarray():
        # NaT propagates as a null entry in the resulting pyarrow array.
        ms_values.append(None if td is NaT else td.components.milliseconds)
    return type(self)(pa.array(ms_values, type=pa.int32()))

@property
def _dt_microseconds(self):
    """Microsecond component of each element, as an int32 pyarrow-backed array."""
    us_values = self._to_timedeltaarray().microseconds
    converted = pa.array(us_values, from_pandas=True, type=pa.int32())
    return type(self)(converted)

@property
def _dt_nanoseconds(self):
    """Nanosecond component of each element, as an int32 pyarrow-backed array."""
    ns_values = self._to_timedeltaarray().nanoseconds
    converted = pa.array(ns_values, from_pandas=True, type=pa.int32())
    return type(self)(converted)

def _dt_to_pytimedelta(self):
    """Return the elements as an object-dtype ndarray of datetime.timedelta."""
    values = self._pa_array.to_pylist()
    if self._dtype.pyarrow_dtype.unit == "ns":
        # NOTE(review): presumably ns-unit scalars come back as pandas
        # Timedelta and need explicit conversion — confirm against pyarrow.
        values = [None if v is None else v.to_pytimedelta() for v in values]
    return np.array(values, dtype=object)

def _dt_total_seconds(self):
    """Total duration of each element expressed in (fractional) seconds."""
    totals = self._to_timedeltaarray().total_seconds()
    return type(self)(pa.array(totals, from_pandas=True))

def _dt_as_unit(self, unit: str):
    """Convert the underlying temporal data to the given time unit.

    Parameters
    ----------
    unit : str
        Target unit (e.g. "ns", "ms").

    Raises
    ------
    NotImplementedError
        If the underlying pyarrow type is a date type.
    """
    if pa.types.is_date(self.dtype.pyarrow_dtype):
        raise NotImplementedError("as_unit not implemented for date types")
    # Round-trip through the pandas-native datelike array so that unit
    # conversion semantics match pandas, not a raw pyarrow cast.
    pd_array = self._maybe_convert_datelike_array()
    # Don't just cast _pa_array in order to follow pandas unit conversion rules
    return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True))

@property
def _dt_year(self):
    """Year of each element, computed via pyarrow compute."""
    year_array = pc.year(self._pa_array)
    return type(self)(year_array)
Expand Down
39 changes: 38 additions & 1 deletion pandas/core/indexes/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,20 @@ def _delegate_method(self, name: str, *args, **kwargs):
return result


@delegate_names(
delegate=ArrowExtensionArray,
accessors=TimedeltaArray._datetimelike_ops,
typ="property",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
@delegate_names(
delegate=ArrowExtensionArray,
accessors=TimedeltaArray._datetimelike_methods,
typ="method",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
@delegate_names(
delegate=ArrowExtensionArray,
accessors=DatetimeArray._datetimelike_ops,
Expand Down Expand Up @@ -213,6 +227,9 @@ def _delegate_method(self, name: str, *args, **kwargs):

return result

def to_pytimedelta(self):
    """Return the data as an ndarray of datetime.timedelta objects."""
    arrow_array = cast(ArrowExtensionArray, self._parent.array)
    return arrow_array._dt_to_pytimedelta()

def to_pydatetime(self):
# GH#20306
warnings.warn(
Expand Down Expand Up @@ -241,6 +258,26 @@ def isocalendar(self) -> DataFrame:
)
return iso_calendar_df

@property
def components(self) -> DataFrame:
    """Return a DataFrame with one int column per timedelta component."""
    from pandas import DataFrame

    component_names = (
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    )
    # Each component is exposed by the backing array as a `_dt_<name>`
    # property; dict insertion order fixes the column order.
    data = {}
    for name in component_names:
        data[name] = getattr(self._parent.array, f"_dt_{name}")
    return DataFrame(data)


@delegate_names(
delegate=DatetimeArray,
Expand Down Expand Up @@ -592,7 +629,7 @@ def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor
index=orig.index,
)

if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M":
if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm":
return ArrowTemporalProperties(data, orig)
if lib.is_np_dtype(data.dtype, "M"):
return DatetimeProperties(data, orig)
Expand Down
105 changes: 105 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2723,6 +2723,111 @@ def test_dt_tz_convert(unit):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"])
def test_as_unit(dtype):
    # GH 52284
    original = pd.Series([1000, None], dtype=dtype)
    converted = original.dt.as_unit("ns")
    # Converting units via .dt must agree with an explicit astype.
    target = original.astype(dtype.replace("ms", "ns"))
    tm.assert_series_equal(converted, target)


@pytest.mark.parametrize(
    "prop, expected",
    [
        ["days", 1],
        ["seconds", 2],
        ["microseconds", 3],
        ["nanoseconds", 4],
    ],
)
def test_dt_timedelta_properties(prop, expected):
    # GH 52284
    td = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([td, None], dtype=ArrowDtype(pa.duration("ns")))

    result = getattr(ser.dt, prop)

    expected = pd.Series(
        ArrowExtensionArray(pa.array([expected, None], type=pa.int32()))
    )
    tm.assert_series_equal(result, expected)


def test_dt_timedelta_total_seconds():
    # GH 52284
    td = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([td, None], dtype=ArrowDtype(pa.duration("ns")))

    result = ser.dt.total_seconds()

    # 1 day + 2 s + 3 us = 86402.000003 s; sub-microsecond part is dropped.
    expected = pd.Series(
        ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64()))
    )
    tm.assert_series_equal(result, expected)


def test_dt_to_pytimedelta():
    # GH 52284
    data = [timedelta(1, 2, 3), timedelta(1, 2, 4)]
    ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns")))

    result = ser.dt.to_pytimedelta()

    tm.assert_numpy_array_equal(result, np.array(data, dtype=object))
    # Elements must be plain datetime.timedelta, not pandas Timedelta.
    for item in result:
        assert type(item) is timedelta

    # Must match the behavior of the numpy-backed equivalent.
    numpy_backed = ser.astype("timedelta64[ns]").dt.to_pytimedelta()
    tm.assert_numpy_array_equal(result, numpy_backed)


def test_dt_components():
    # GH 52284
    td = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([td, None], dtype=ArrowDtype(pa.duration("ns")))

    result = ser.dt.components

    component_columns = [
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    ]
    # Second row is all-null because the input element is None.
    expected = pd.DataFrame(
        [[1, 0, 0, 2, 0, 3, 4], [None] * 7],
        columns=component_columns,
        dtype="int32[pyarrow]",
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("skipna", [True, False])
def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
# GH51624
Expand Down