From cae4400e657ce161692b1de28eef83aa06d2c1ff Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Tue, 7 Nov 2017 22:56:49 -0500 Subject: [PATCH 1/9] #18058: improve DatetimeIndex.date performance --- pandas/_libs/tslib.pyx | 33 ++++++++++++++++++++------------ pandas/core/indexes/datetimes.py | 13 +++++++++++-- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 5e3eb1f00b18c..f380c1a720c17 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -23,7 +23,7 @@ cimport util from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT, - timedelta, datetime) + timedelta, datetime, date) # import datetime C API PyDateTime_IMPORT # this is our datetime.pxd @@ -80,10 +80,16 @@ cdef inline object create_datetime_from_ts( return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) +cdef inline object create_date_from_ts( + int64_t value, pandas_datetimestruct dts, + object tz, object freq): + """ convenience routine to construct a datetime.date from its parts """ + return date(dts.year, dts.month, dts.day) -def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): - # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == - # True) +def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False, + kind="datetime"): + # convert an i8 repr to an ndarray of datetimes, Timestamp (if box == + # True) or dates (if kind="date") cdef: Py_ssize_t i, n = len(arr) @@ -94,16 +100,19 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, pandas_datetimestruct, object, object) - if box and is_string_object(freq): - from pandas.tseries.frequencies import to_offset - freq = to_offset(freq) - - if box: - func_create = create_timestamp_from_ts + if kind == "date": + func_create = create_date_from_ts else: - func_create = create_datetime_from_ts + if box and is_string_object(freq): + from pandas.tseries.frequencies import to_offset + freq = to_offset(freq) + + if box: + func_create = create_timestamp_from_ts + else: + func_create = create_datetime_from_ts - if tz is not None: + if tz is not None and kind != "date": if is_utc(tz): for i in range(n): value = arr[i] diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index a2ed2ff9bce5e..318199dbc1927 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -976,6 +976,16 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.values.copy() + def to_pydate(self): + """ + Return DateTimeIndex as object ndarray of datetime.date objects + + Returns + ------- + dates : ndarray + """ + return libts.ints_to_pydatetime(self.asi8, kind="date") + def to_pydatetime(self): """ Return DatetimeIndex as object ndarray of datetime.datetime objects @@ -1687,8 +1697,7 @@ def date(self): Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). """ - return self._maybe_mask_results(libalgos.arrmap_object( - self.asobject.values, lambda x: x.date())) + return self.normalize().to_pydate() def normalize(self): """ From 1ec7216d31358980db2b75a2c8bddaa867ca16ff Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Tue, 7 Nov 2017 23:46:01 -0500 Subject: [PATCH 2/9] Fix PEP8 issue --- pandas/_libs/tslib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index f380c1a720c17..df24679ae74c7 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -86,6 +86,7 @@ cdef inline object create_date_from_ts( """ convenience routine to construct a datetime.date from its parts """ return date(dts.year, dts.month, dts.day) + def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False, kind="datetime"): # convert an i8 repr to an ndarray of datetimes, Timestamp (if box == From b0ca4ee97027af81e3b4942306973b5a940c51ac Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Thu, 9 Nov 2017 22:11:06 -0500 Subject: [PATCH 3/9] Modify argument 'box' of ints_to_pydatetime --- pandas/_libs/tslib.pyx | 28 +++++++++++++++------------- pandas/core/dtypes/concat.py | 4 ++-- pandas/core/indexes/datetimes.py | 4 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index df24679ae74c7..566bff96d1760 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -87,10 +87,13 @@ cdef inline object create_date_from_ts( return date(dts.year, dts.month, dts.day) -def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False, - kind="datetime"): - # convert an i8 repr to an ndarray of datetimes, Timestamp (if box == - # True) or dates (if kind="date") +def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, + box="datetime"): + # convert an i8 repr to an ndarray of datetimes (if box == "datetime"), + # Timestamp (if box == "timestamp") or dates (if box == "date") + + assert ((box == "datetime") or (box == "date") or (box == "timestamp")), \ + "box must be one of 'datetime', 'date' or 'timestamp'" cdef: Py_ssize_t i, n = len(arr) @@ -101,19 +104,18 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False, ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, pandas_datetimestruct, object, object) - if kind == "date": + if box == "date": func_create = create_date_from_ts - else: - if box and is_string_object(freq): + elif box == "timestamp": + func_create = create_timestamp_from_ts + + if is_string_object(freq): from pandas.tseries.frequencies import to_offset freq = to_offset(freq) + elif box == "datetime": + func_create = create_datetime_from_ts - if box: - func_create = create_timestamp_from_ts - else: - func_create = create_datetime_from_ts - - if tz is not None and kind != "date": + if tz is not None and box != "date": if is_utc(tz): for i in range(n): value = arr[i] diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index fe306b51de8d0..085c45abdd483 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -405,12 +405,12 @@ def convert_to_pydatetime(x, axis): else: shape = x.shape x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), - box=True) + box="timestamp") x = x.reshape(shape) elif x.dtype == _TD_DTYPE: shape = x.shape - x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) + x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box="True") x = x.reshape(shape) if axis == 1: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 318199dbc1927..d4d0067e598dc 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -984,7 +984,7 @@ def to_pydate(self): ------- dates : ndarray """ - return libts.ints_to_pydatetime(self.asi8, kind="date") + return libts.ints_to_pydatetime(self.asi8, box="date") def to_pydatetime(self): """ @@ -1247,7 +1247,7 @@ def __iter__(self): end_i = min((i + 1) * chunksize, length) converted = libts.ints_to_pydatetime(data[start_i:end_i], tz=self.tz, freq=self.freq, - box=True) + box="timestamp") for v in converted: yield v From 244c98300c59aed1d4cedc7bd94464debca3df9a Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Sat, 11 Nov 2017 01:27:54 -0500 Subject: [PATCH 4/9] Add comments to ints_to_pydatetime and fix typo in concat.py --- pandas/_libs/tslib.pyx | 27 +++++++++++++++++++++++---- pandas/core/dtypes/concat.py | 2 +- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 566bff96d1760..a2b4dac5b3cac 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -89,11 +89,28 @@ cdef inline object create_date_from_ts( def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box="datetime"): - # convert an i8 repr to an ndarray of datetimes (if box == "datetime"), - # Timestamp (if box == "timestamp") or dates (if box == "date") + """ + Convert an i8 repr to an ndarray of datetimes, date or Timestamp + + Parameters + ---------- + arr : array of i8 repr + tz : the timezone to convert to, + can only be used with datetime/Timestamp, + default is None + freq : frequency to be used when converting to Timestamp, default is None + box : the dtype to convert to, default is datetime + If datetime, convert to datetime.datetime + If date, convert to datetime.date + If Timestamp, convert to pandas.Timestamp + + Returns + ------- + result : array of dtype specified by box + """ assert ((box == "datetime") or (box == "date") or (box == "timestamp")), \ - "box must be one of 'datetime', 'date' or 'timestamp'" + "box must be one of 'datetime', 'date' or 'timestamp'" cdef: Py_ssize_t i, n = len(arr) @@ -105,6 +122,8 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, object (*func_create)(int64_t, pandas_datetimestruct, object, object) if box == "date": + assert (tz is not None), "tz should be None when converting to date" + func_create = create_date_from_ts elif box == "timestamp": func_create = create_timestamp_from_ts @@ -115,7 +134,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, elif box == "datetime": func_create = create_datetime_from_ts - if tz is not None and box != "date": + if tz is not None: if is_utc(tz): for i in range(n): value = arr[i] diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 085c45abdd483..7f9245bb31530 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -410,7 +410,7 @@ def convert_to_pydatetime(x, axis): elif x.dtype == _TD_DTYPE: shape = x.shape - x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box="True") + x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) x = x.reshape(shape) if axis == 1: From d93276111a84cff803bf29cf702688700aabd203 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Sat, 11 Nov 2017 17:18:45 -0500 Subject: [PATCH 5/9] Fix assertion in ints_to_pydatetime --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a2b4dac5b3cac..d0a2a48a655f0 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -122,7 +122,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, object (*func_create)(int64_t, pandas_datetimestruct, object, object) if box == "date": - assert (tz is not None), "tz should be None when converting to date" + assert (tz is None), "tz should be None when converting to date" func_create = create_date_from_ts elif box == "timestamp": From 67982c73f66030e26e4723eb9e0f7a8cd7cb7c03 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Sat, 11 Nov 2017 22:59:51 -0500 Subject: [PATCH 6/9] Update whatsnew for 0.22, add asv for DatetimeIndex.date --- asv_bench/benchmarks/timeseries.py | 2 ++ doc/source/whatsnew/v0.22.0.txt | 2 ++ 2 files changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 9614a63332609..84d04e6ad110b 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -107,6 +107,8 @@ def time_infer_freq_daily(self): def time_infer_freq_business(self): infer_freq(self.b_freq) + def time_to_date(self): + self.rng.date class TimeDatetimeConverter(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4f403ff8053a7..b845e84d433f7 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -91,6 +91,8 @@ Performance Improvements - Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) - The overriden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) +- Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) +- .. _whatsnew_0220.docs: From 6a10a210b7d1a18550fbf55b5d55b2022e285010 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Sun, 12 Nov 2017 16:07:48 -0500 Subject: [PATCH 7/9] Add asv for dti.time and dti.to_pydatetime, remove to_date() from DatetimeIndex --- asv_bench/benchmarks/timeseries.py | 6 ++++++ pandas/core/indexes/datetimes.py | 12 +----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 84d04e6ad110b..2ca2416f58b57 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -89,6 +89,9 @@ def time_dti_factorize(self): def time_dti_tz_factorize(self): self.dti_tz.factorize() + def time_dti_time(self): + self.rng.time + def time_timestamp_tzinfo_cons(self): self.rng5[0] @@ -110,6 +113,9 @@ def time_infer_freq_business(self): def time_to_date(self): self.rng.date + def time_to_pydatetime(self): + self.rng.to_pydatetime() + class TimeDatetimeConverter(object): goal_time = 0.2 diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d4d0067e598dc..bc9ceebab8fac 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -976,16 +976,6 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.values.copy() - def to_pydate(self): - """ - Return DateTimeIndex as object ndarray of datetime.date objects - - Returns - ------- - dates : ndarray - """ - return libts.ints_to_pydatetime(self.asi8, box="date") - def to_pydatetime(self): """ Return DatetimeIndex as object ndarray of datetime.datetime objects @@ -1697,7 +1687,7 @@ def date(self): Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). """ - return self.normalize().to_pydate() + return libts.ints_to_pydatetime(self.normalize().asi8, box="date") def normalize(self): """ From 213b1c46a67e28593b6a4fa2e893ba16b4600f70 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Sun, 12 Nov 2017 22:18:34 -0500 Subject: [PATCH 8/9] Fix doc string for ints_to_pydatetime --- pandas/_libs/tslib.pyx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d0a2a48a655f0..a119e22b8e3ee 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -94,15 +94,15 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, Parameters ---------- - arr : array of i8 repr - tz : the timezone to convert to, - can only be used with datetime/Timestamp, - default is None - freq : frequency to be used when converting to Timestamp, default is None - box : the dtype to convert to, default is datetime - If datetime, convert to datetime.datetime - If date, convert to datetime.date - If Timestamp, convert to pandas.Timestamp + arr : array of i8 + tz : str, default None + convert to this timezone + freq : str/Offset, default None + freq to convert + box : {'datetime', 'timestamp', 'date'}, default 'datetime' + If datetime, convert to datetime.datetime + If date, convert to datetime.date + If Timestamp, convert to pandas.Timestamp Returns ------- From 3b7fa77b60b3f0a66efc8a33552d88153acc86f4 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Mon, 13 Nov 2017 19:56:37 -0500 Subject: [PATCH 9/9] Fix PEP8 issue --- pandas/core/indexes/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index bc9ceebab8fac..e08bf4a625bce 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1687,7 +1687,7 @@ def date(self): Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). """ - return libts.ints_to_pydatetime(self.normalize().asi8, box="date") + return libts.ints_to_pydatetime(self.normalize().asi8, box="date") def normalize(self): """