From cdd1a85abb5523eff9cda38b6bb828084202f5f6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:25:48 -0700 Subject: [PATCH 1/3] ENH: Implement interchange protocol for DatetimeTZDtype --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/interchange/column.py | 13 +++++++++++-- pandas/core/interchange/from_dataframe.py | 8 ++++---- pandas/core/interchange/utils.py | 9 ++++++--- pandas/tests/interchange/test_impl.py | 11 +++++++++++ 5 files changed, 33 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 65f5328189eeb..f2e015d0b72e7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -148,6 +148,7 @@ Other enhancements - Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`) - Classes that are useful for type-hinting have been added to the public API in the new submodule ``pandas.api.typing`` (:issue:`48577`) - Implemented :attr:`Series.dt.is_month_start`, :attr:`Series.dt.is_month_end`, :attr:`Series.dt.is_year_start`, :attr:`Series.dt.is_year_end`, :attr:`Series.dt.is_quarter_start`, :attr:`Series.dt.is_quarter_end`, :attr:`Series.dt.is_days_in_month`, :attr:`Series.dt.unit`, :meth:`Series.dt.is_normalize`, :meth:`Series.dt.day_name`, :meth:`Series.dt.month_name`, :meth:`Series.dt.tz_convert` for :class:`ArrowDtype` with ``pyarrow.timestamp`` (:issue:`52388`, :issue:`51718`) +- Implemented :func:`api.interchange.from_dataframe` for :class:`DatetimeTZDtype` (:issue:`54239`) - Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype`. (:issue:`52201`) - Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`) - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index ff4ff487e23ea..5fc2a821d5dbd 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -9,7 +9,10 @@ from pandas.errors import NoBufferPresent from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.dtypes import ArrowDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + DatetimeTZDtype, +) import pandas as pd from pandas.api.types import is_string_dtype @@ -138,6 +141,8 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: raise ValueError(f"Data type {dtype} not supported by interchange protocol") if isinstance(dtype, ArrowDtype): byteorder = dtype.numpy_dtype.byteorder + elif isinstance(dtype, DatetimeTZDtype): + byteorder = dtype.base.byteorder else: byteorder = dtype.byteorder @@ -269,7 +274,11 @@ def _get_data_buffer( DtypeKind.BOOL, DtypeKind.DATETIME, ): - buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy) + if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: + np_arr = self._col.dt.tz_convert(None).to_numpy() + else: + np_arr = self._col.to_numpy() + buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values._codes diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 0fe92f9b1be50..def6d4bac82b5 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -325,20 +325,20 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: return np.asarray(str_list, dtype="object"), buffers -def parse_datetime_format_str(format_str, data): +def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: """Parse datetime `format_str` to interpret the `data`.""" # timestamp 'ts{unit}:tz' timestamp_meta = re.match(r"ts([smun]):(.*)", format_str) if timestamp_meta: unit, tz = timestamp_meta.group(1), timestamp_meta.group(2) - if tz != "": - raise NotImplementedError("Timezones are not supported yet") if unit != "s": # the format string describes only a first letter of the unit, so # add one extra letter to convert the unit to numpy-style: # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' unit += "s" data = data.astype(f"datetime64[{unit}]") + if tz != "": + data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz) return data # date 'td{Days/Ms}' @@ -358,7 +358,7 @@ def parse_datetime_format_str(format_str, data): raise NotImplementedError(f"DateTime kind is not supported: {format_str}") -def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: +def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]: """ Convert a column holding DateTime data to a NumPy array. diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 46103d0d9e8f1..4ac063080e62d 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -4,7 +4,6 @@ from __future__ import annotations -import re import typing import numpy as np @@ -14,6 +13,7 @@ from pandas.core.dtypes.dtypes import ( ArrowDtype, CategoricalDtype, + DatetimeTZDtype, ) if typing.TYPE_CHECKING: @@ -134,10 +134,13 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: - # dtype.str -> ' ' 'n' + resolution = np.datetime_data(dtype)[0][0] return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="") + elif isinstance(dtype, DatetimeTZDtype): + return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) + raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." ) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 5fce4f162d71f..bfb0eceaa0ca1 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -284,3 +284,14 @@ def test_empty_pyarrow(data): arrow_df = pa_from_dataframe(expected) result = from_dataframe(arrow_df) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("tz", ["UTC", "US/Pacific"]) +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) +def test_datetimetzdtype(tz, unit): + # GH 54239 + tz_data = ( + pd.date_range("2018-01-01", periods=5, freq="D").tz_localize(tz).as_unit(unit) + ) + df = pd.DataFrame({"ts_tz": tz_data}) + tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) From bb0b2f0c285ec059211e054b76852677fd922df8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jul 2023 17:23:36 -0700 Subject: [PATCH 2/3] Add type ignores --- pandas/core/interchange/column.py | 2 +- pandas/core/interchange/from_dataframe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 5fc2a821d5dbd..c49f8d58cf073 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -142,7 +142,7 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: if isinstance(dtype, ArrowDtype): byteorder = dtype.numpy_dtype.byteorder elif isinstance(dtype, DatetimeTZDtype): - byteorder = dtype.base.byteorder + byteorder = dtype.base.byteorder # type: ignore[union-attr] else: byteorder = dtype.byteorder diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index def6d4bac82b5..d3aece6e63798 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -389,7 +389,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any length=col.size(), ) - data = parse_datetime_format_str(format_str, data) + data = parse_datetime_format_str(format_str, data) # type: ignore[assignment] data = set_nulls(data, col, buffers["validity"]) return data, buffers From b8b6005592b1b1b00145914a9d965b7f2abb589d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:44:47 -0700 Subject: [PATCH 3/3] Add comment --- pandas/core/interchange/column.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index c49f8d58cf073..acfbc5d9e6c62 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -274,6 +274,8 @@ def _get_data_buffer( DtypeKind.BOOL, DtypeKind.DATETIME, ): + # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make + # it longer than 4 characters if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: