Skip to content

Commit 8f6a68e

Browse files
Daniel Rothenberg authored and shoyer committed
Add DatetimeAccessor for accessing datetime fields via .dt attribute (#1356)
* Add DatetimeAccessor for accessing datetime fields via `.dt` attribute * Cleaning up unit tests * Cleaning up comments and warnings in accessors * Indirectly access pandas tslib through Series accessors * Re-factor injection of datetime field accessor properties * Undo loop/injection of _get_date_field accessors * Remove public-facing dt property * Remove extra 'field' argument from _tslib_field_accessor * Added support for dask arrays * Added dask test cases Fixed a bug where data wasn't computed in correct order * Simplified _get_date_field for both dask/numpy arrays; additional code review cleanups * Fixing flake8 complaints * Adding whats-new entry * Updated timeseries docs with note about dt accessor * Moved season accessor to DatetimeAccessor * Re-factor virtual variable logic to lean on DateTimeAccessor * Added "Returns" documentation to _get_date_field Fixed imports to facilitate more direct implementation of DateTimeAccessor as a property in DataArray Moved _access_through_series to a top-level function in accessors.py so that dask serialization will hopefully work a bit better * Adding timestamp accessor * Hard-coding expected dtypes for each datetime field * Fix typo in non-datetime virtual variable access * Update What's New and timeseries docs
1 parent ab4ffee commit 8f6a68e

File tree

8 files changed

+287
-17
lines changed

8 files changed

+287
-17
lines changed

doc/time-series.rst

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,22 @@ For more details, read the pandas documentation.
8888
Datetime components
8989
-------------------
9090

91-
xarray supports a notion of "virtual" or "derived" coordinates for
91+
Similar `to pandas`_, the components of datetime objects contained in a
92+
given ``DataArray`` can be quickly computed using a special ``.dt`` accessor.
93+
94+
.. _to pandas: http://pandas.pydata.org/pandas-docs/stable/basics.html#basics-dt-accessors
95+
96+
.. ipython:: python
97+
98+
time = pd.date_range('2000-01-01', freq='6H', periods=365 * 4)
99+
ds = xr.Dataset({'foo': ('time', np.arange(365 * 4)), 'time': time})
100+
ds.time.dt.hour
101+
ds.time.dt.dayofweek
102+
103+
The ``.dt`` accessor works on both coordinate dimensions and
104+
multi-dimensional data.
105+
106+
xarray also supports a notion of "virtual" or "derived" coordinates for
92107
`datetime components`__ implemented by pandas, including "year", "month",
93108
"day", "hour", "minute", "second", "dayofyear", "week", "dayofweek", "weekday"
94109
and "quarter":
@@ -100,11 +115,13 @@ __ http://pandas.pydata.org/pandas-docs/stable/api.html#time-date-components
100115
ds['time.month']
101116
ds['time.dayofyear']
102117
103-
xarray adds ``'season'`` to the list of datetime components supported by pandas:
118+
For use as a derived coordinate, xarray adds ``'season'`` to the list of
119+
datetime components supported by pandas:
104120

105121
.. ipython:: python
106122
107123
ds['time.season']
124+
ds['time'].dt.season
108125
109126
The set of valid seasons consists of 'DJF', 'MAM', 'JJA' and 'SON', labeled by
110127
the first letters of the corresponding months.
@@ -124,7 +141,7 @@ calculate the mean by time of day:
124141
125142
For upsampling or downsampling temporal resolutions, xarray offers a
126143
:py:meth:`~xarray.Dataset.resample` method building on the core functionality
127-
offered by the pandas method of the same name. Resample uses essentialy the
144+
offered by the pandas method of the same name. Resample uses essentially the
128145
same api as ``resample`` `in pandas`_.
129146

130147
.. _in pandas: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#up-and-downsampling

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ What's New
1818
v0.9.6 (unreleased)
1919
-------------------
2020

21+
- Add ``.dt`` accessor to DataArrays for computing datetime-like properties
22+
for the values they contain, similar to ``pandas.Series`` (:issue:`358`).
23+
By `Daniel Rothenberg <https://github.com/darothen>`_.
24+
2125
Enhancements
2226
~~~~~~~~~~~~
2327

xarray/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import absolute_import
22
from __future__ import division
33
from __future__ import print_function
4+
45
from .core.alignment import align, broadcast, broadcast_arrays
56
from .core.common import full_like, zeros_like, ones_like
67
from .core.combine import concat, auto_combine

xarray/core/accessors.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
from __future__ import absolute_import
2+
from __future__ import division
3+
from __future__ import print_function
4+
5+
from .common import is_datetime_like
6+
from .pycompat import dask_array_type
7+
8+
from functools import partial
9+
10+
import numpy as np
11+
import pandas as pd
12+
13+
14+
def _season_from_months(months):
15+
"""Compute season (DJF, MAM, JJA, SON) from month ordinal
16+
"""
17+
# TODO: Move "season" accessor upstream into pandas
18+
seasons = np.array(['DJF', 'MAM', 'JJA', 'SON'])
19+
months = np.asarray(months)
20+
return seasons[(months // 3) % 4]
21+
22+
23+
def _access_through_series(values, name):
24+
"""Coerce an array of datetime-like values to a pandas Series and
25+
access requested datetime component
26+
"""
27+
values_as_series = pd.Series(values.ravel())
28+
if name == "season":
29+
months = values_as_series.dt.month.values
30+
field_values = _season_from_months(months)
31+
else:
32+
field_values = getattr(values_as_series.dt, name).values
33+
return field_values.reshape(values.shape)
34+
35+
36+
def _get_date_field(values, name, dtype):
    """Fetch a datetime field from a numpy or dask array of datetimes.

    The lookup is delegated to ``_access_through_series``, which wraps the
    data in a pandas Series and reads the field through its ``.dt``
    attribute. Dask arrays are processed block by block via ``map_blocks``
    rather than being handed over whole.

    Parameters
    ----------
    values : np.ndarray or dask.array-like
        Array-like container of datetime-like values
    name : str
        Name of datetime field to access
    dtype : dtype-like
        dtype for output date field values; only passed along on the
        dask code path, as the ``dtype`` argument of ``map_blocks``

    Returns
    -------
    datetime_fields : same type as values
        Array-like of datetime fields accessed for each element in values

    """
    if not isinstance(values, dask_array_type):
        return _access_through_series(values, name)

    # Import lazily so that dask is only needed when a dask array shows up.
    from dask.array import map_blocks
    return map_blocks(_access_through_series, values, name, dtype=dtype)
61+
62+
63+
class DatetimeAccessor(object):
    """Access datetime fields for DataArrays with datetime-like dtypes.

    Similar to pandas, fields can be accessed through the `.dt` attribute
    for applicable DataArrays:

       >>> ds = xarray.Dataset({'time': pd.date_range(start='2000/01/01',
       ...                                            freq='D', periods=100)})
       >>> ds.time.dt
       <xarray.core.accessors.DatetimeAccessor at 0x10c369f60>
       >>> ds.time.dt.dayofyear[:5]
       <xarray.DataArray 'dayofyear' (time: 5)>
       array([1, 2, 3, 4, 5], dtype=int32)
       Coordinates:
         * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 ...

    All of the pandas fields are accessible here. Note that these fields are
    not calendar-aware; if your datetimes are encoded with a non-Gregorian
    calendar (e.g. a 360-day calendar) using netcdftime, then some fields like
    `dayofyear` may not be accurate.

    Raises
    ------
    TypeError
        On construction, if the wrapped object's dtype is neither
        datetime64 nor timedelta64.

    """
    def __init__(self, xarray_obj):
        if not is_datetime_like(xarray_obj.dtype):
            raise TypeError("'dt' accessor only available for "
                            "DataArray with datetime64 or timedelta64 dtype")
        self._obj = xarray_obj

    def _tslib_field_accessor(name, docstring=None, dtype=None):
        # Property factory, called only while the class body executes.
        # ``dtype`` is forwarded to ``_get_date_field``; when it is left as
        # None the dtype of the wrapped object is forwarded instead.
        def f(self, dtype=dtype):
            if dtype is None:
                dtype = self._obj.dtype
            obj_type = type(self._obj)
            result = _get_date_field(self._obj.data, name, dtype)
            # Rebuild the same kind of object, named after the field and
            # carrying the coordinates and dimensions of the original.
            return obj_type(result, name=name,
                            coords=self._obj.coords, dims=self._obj.dims)

        f.__name__ = name
        f.__doc__ = docstring
        return property(f)

    year = _tslib_field_accessor('year', "The year of the datetime", np.int64)
    month = _tslib_field_accessor(
        'month', "The month as January=1, December=12", np.int64
    )
    day = _tslib_field_accessor('day', "The days of the datetime", np.int64)
    hour = _tslib_field_accessor('hour', "The hours of the datetime", np.int64)
    minute = _tslib_field_accessor(
        'minute', "The minutes of the datetime", np.int64
    )
    second = _tslib_field_accessor(
        'second', "The seconds of the datetime", np.int64
    )
    microsecond = _tslib_field_accessor(
        'microsecond', "The microseconds of the datetime", np.int64
    )
    nanosecond = _tslib_field_accessor(
        'nanosecond', "The nanoseconds of the datetime", np.int64
    )
    weekofyear = _tslib_field_accessor(
        'weekofyear', "The week ordinal of the year", np.int64
    )
    week = weekofyear
    dayofweek = _tslib_field_accessor(
        'dayofweek', "The day of the week with Monday=0, Sunday=6", np.int64
    )
    weekday = dayofweek

    weekday_name = _tslib_field_accessor(
        'weekday_name', "The name of day in a week (ex: Friday)", object
    )

    dayofyear = _tslib_field_accessor(
        'dayofyear', "The ordinal day of the year", np.int64
    )
    # The quarter is an integer field. Declaring its dtype explicitly keeps
    # the accessor from falling back to the datetime-like dtype of the
    # wrapped array, which would mislabel the result on the dask code path.
    quarter = _tslib_field_accessor(
        'quarter', "The quarter of the date", np.int64
    )
    days_in_month = _tslib_field_accessor(
        'days_in_month', "The number of days in the month", np.int64
    )
    daysinmonth = days_in_month

    season = _tslib_field_accessor(
        "season", "Season of the year (ex: DJF)", object
    )

    time = _tslib_field_accessor(
        "time", "Times of day (datetime.time objects) of the datetimes",
        object
    )

xarray/core/common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,3 +761,10 @@ def ones_like(other, dtype=None):
761761
"""Shorthand for full_like(other, 1, dtype)
762762
"""
763763
return full_like(other, 1, dtype)
764+
765+
766+
def is_datetime_like(dtype):
    """Return True if ``dtype`` is a numpy datetime64 or timedelta64 dtype
    """
    temporal_kinds = (np.datetime64, np.timedelta64)
    return any(np.issubdtype(dtype, kind) for kind in temporal_kinds)

xarray/core/dataarray.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from . import rolling
1616
from . import ops
1717
from . import utils
18+
from .accessors import DatetimeAccessor
1819
from .alignment import align, reindex_like_indexers
1920
from .common import AbstractArray, BaseDataObject
2021
from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource,
@@ -158,6 +159,7 @@ class DataArray(AbstractArray, BaseDataObject):
158159
"""
159160
_groupby_cls = groupby.DataArrayGroupBy
160161
_rolling_cls = rolling.DataArrayRolling
162+
dt = property(DatetimeAccessor)
161163

162164
def __init__(self, data, coords=None, dims=None, name=None,
163165
attrs=None, encoding=None, fastpath=False):

xarray/core/dataset.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from .. import conventions
2222
from .alignment import align
2323
from .coordinates import DatasetCoordinates, LevelCoordinatesSource, Indexes
24-
from .common import ImplementsDatasetReduce, BaseDataObject
24+
from .common import ImplementsDatasetReduce, BaseDataObject, is_datetime_like
2525
from .merge import (dataset_update_method, dataset_merge_method,
2626
merge_data_and_coords)
2727
from .utils import (Frozen, SortedKeysDict, maybe_wrap_array, hashable,
@@ -32,6 +32,8 @@
3232
integer_types, dask_array_type, range)
3333
from .options import OPTIONS
3434

35+
import xarray as xr
36+
3537
# list of attributes of pd.DatetimeIndex that are ndarrays of time info
3638
_DATETIMEINDEX_COMPONENTS = ['year', 'month', 'day', 'hour', 'minute',
3739
'second', 'microsecond', 'nanosecond', 'date',
@@ -74,20 +76,11 @@ def _get_virtual_variable(variables, key, level_vars=None, dim_sizes=None):
7476
virtual_var = ref_var
7577
var_name = key
7678
else:
77-
if ref_var.ndim == 1:
78-
date = ref_var.to_index()
79-
elif ref_var.ndim == 0:
80-
date = pd.Timestamp(ref_var.values)
81-
else:
82-
raise KeyError(key)
83-
84-
if var_name == 'season':
85-
# TODO: move 'season' into pandas itself
86-
seasons = np.array(['DJF', 'MAM', 'JJA', 'SON'])
87-
month = date.month
88-
data = seasons[(month // 3) % 4]
79+
if is_datetime_like(ref_var.dtype):
80+
ref_var = xr.DataArray(ref_var)
81+
data = getattr(ref_var.dt, var_name).data
8982
else:
90-
data = getattr(date, var_name)
83+
data = getattr(ref_var, var_name).data
9184
virtual_var = Variable(ref_var.dims, data)
9285

9386
return ref_name, var_name, virtual_var

xarray/tests/test_accessors.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
from __future__ import absolute_import
2+
from __future__ import division
3+
from __future__ import print_function
4+
5+
import xarray as xr
6+
import numpy as np
7+
import pandas as pd
8+
9+
from . import TestCase, requires_dask
10+
11+
12+
class TestDatetimeAccessor(TestCase):
    """Tests for the ``.dt`` datetime accessor on ``xr.DataArray``."""

    def setUp(self):
        """Build the fixtures shared by the tests.

        ``self.data`` holds random floats on a (lon, lat, time) grid whose
        ``time`` coordinate is an hourly ``pd.date_range``.
        ``self.times_data`` lives on the same grid, but its values are
        themselves datetimes drawn at random from that time axis.
        """
        nt = 100
        data = np.random.rand(10, 10, nt)
        lons = np.linspace(0, 11, 10)
        lats = np.linspace(0, 20, 10)
        self.times = pd.date_range(start="2000/01/01", freq='H', periods=nt)

        self.data = xr.DataArray(data, coords=[lons, lats, self.times],
                                 dims=['lon', 'lat', 'time'], name='data')

        # Multi-dimensional array of datetimes (not just a 1-d coordinate).
        self.times_arr = np.random.choice(self.times, size=(10, 10, nt))
        self.times_data = xr.DataArray(self.times_arr,
                                       coords=[lons, lats, self.times],
                                       dims=['lon', 'lat', 'time'],
                                       name='data')

    def test_field_access(self):
        """Fields read through ``.dt`` on the time coordinate match the
        corresponding attributes of the underlying pandas index."""
        years = xr.DataArray(self.times.year, name='year',
                             coords=[self.times, ], dims=['time', ])
        months = xr.DataArray(self.times.month, name='month',
                              coords=[self.times, ], dims=['time', ])
        days = xr.DataArray(self.times.day, name='day',
                            coords=[self.times, ], dims=['time', ])
        hours = xr.DataArray(self.times.hour, name='hour',
                             coords=[self.times, ], dims=['time', ])

        self.assertDataArrayEqual(years, self.data.time.dt.year)
        self.assertDataArrayEqual(months, self.data.time.dt.month)
        self.assertDataArrayEqual(days, self.data.time.dt.day)
        self.assertDataArrayEqual(hours, self.data.time.dt.hour)

    def test_not_datetime_type(self):
        """Accessing ``.dt`` on a coordinate whose values have been replaced
        with int8 data raises a ``TypeError`` mentioning 'dt'."""
        nontime_data = self.data.copy()
        int_data = np.arange(len(self.data.time)).astype('int8')
        nontime_data['time'].values = int_data
        with self.assertRaisesRegexp(TypeError, 'dt'):
            nontime_data.time.dt

    @requires_dask
    def test_dask_field_access(self):
        """``.dt`` fields of a dask-backed array stay lazy, keep the input
        chunking, and compute to the same values as the numpy-backed
        fixture."""
        import dask.array as da

        # Reference results from the in-memory (numpy-backed) fixture.
        years = self.times_data.dt.year
        months = self.times_data.dt.month
        hours = self.times_data.dt.hour
        days = self.times_data.dt.day

        # NOTE: despite the "_2d" suffix, this array uses the dims of
        # self.data, i.e. three dimensions (lon, lat, time).
        dask_times_arr = da.from_array(self.times_arr, chunks=(5, 5, 50))
        dask_times_2d = xr.DataArray(dask_times_arr,
                                     coords=self.data.coords,
                                     dims=self.data.dims,
                                     name='data')
        dask_year = dask_times_2d.dt.year
        dask_month = dask_times_2d.dt.month
        dask_day = dask_times_2d.dt.day
        dask_hour = dask_times_2d.dt.hour

        # Test that the data isn't eagerly evaluated
        assert isinstance(dask_year.data, da.Array)
        assert isinstance(dask_month.data, da.Array)
        assert isinstance(dask_day.data, da.Array)
        assert isinstance(dask_hour.data, da.Array)

        # Double check that outcome chunksize is unchanged
        dask_chunks = dask_times_2d.chunks
        self.assertEqual(dask_year.data.chunks, dask_chunks)
        self.assertEqual(dask_month.data.chunks, dask_chunks)
        self.assertEqual(dask_day.data.chunks, dask_chunks)
        self.assertEqual(dask_hour.data.chunks, dask_chunks)

        # Check the actual output from the accessors
        self.assertDataArrayEqual(years, dask_year.compute())
        self.assertDataArrayEqual(months, dask_month.compute())
        self.assertDataArrayEqual(days, dask_day.compute())
        self.assertDataArrayEqual(hours, dask_hour.compute())

    def test_seasons(self):
        """``.dt.season`` labels twelve consecutive month-end dates with
        the expected DJF/MAM/JJA/SON sequence."""
        dates = pd.date_range(start="2000/01/01", freq="M", periods=12)
        dates = xr.DataArray(dates)
        seasons = ["DJF", "DJF", "MAM", "MAM", "MAM", "JJA", "JJA", "JJA",
                   "SON", "SON", "SON", "DJF"]
        seasons = xr.DataArray(seasons)

        self.assertArrayEqual(seasons.values, dates.dt.season.values)

0 commit comments

Comments
 (0)