-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Add DatetimeAccessor for accessing datetime fields via `.dt` attribute
#1356
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
4c4447d
c2bdd40
f51c707
5308642
621d144
daaacb7
0198c3d
8c38bf9
d4fcb49
9a616ef
e303e89
e35d0d0
f621ecc
74f8756
5ae4e08
0788549
d842159
426a16a
66cdb59
9c2fe26
14ac55c
d50420b
b286313
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
from __future__ import print_function | ||
|
||
from .common import is_datetime_like | ||
from .extensions import register_dataarray_accessor | ||
from .pycompat import dask_array_type | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def _get_date_field_from_dask(values, name):
    """Specialized date field accessor for data contained in dask arrays.

    The field is extracted block by block with ``dask.array.map_blocks``.
    The mapped function wraps each block in a pandas Series and reads the
    field through ``.dt`` eagerly, so no nested dask graph and no inner
    ``compute`` call is needed.

    Parameters
    ----------
    values : dask.array.Array
        Lazy array of datetime-like values.
    name : str
        Name of the datetime field to access (e.g. ``'year'``).

    Returns
    -------
    dask.array.Array
        Lazy array of field values, one per element of ``values``.

    Raises
    ------
    AttributeError
        If ``name`` is not available on the pandas ``.dt`` accessor for
        this dtype. The error surfaces while the graph is being built,
        because the field is probed once to discover the output dtype.
    """
    from dask.array import map_blocks

    def _access_through_series(block):
        # pandas only exposes ``.dt`` on 1-D Series, so flatten the block,
        # read the field, and restore the block's original shape.
        block_as_series = pd.Series(block.ravel())
        field_values = getattr(block_as_series.dt, name).values
        return field_values.reshape(block.shape)

    # The result holds field values (e.g. integers or strings), not
    # datetimes, so advertising ``values.dtype`` would be wrong. Probe the
    # field on an empty array of the same dtype to learn the real dtype.
    field_dtype = _access_through_series(
        np.empty(0, dtype=values.dtype)).dtype

    return map_blocks(_access_through_series, values, dtype=field_dtype)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
|
||
def _get_date_field(values, name):
    """Read a datetime field by routing the data through a pandas Series
    and its `.dt` attribute.

    Parameters
    ----------
    values : np.ndarray or dask.array-like
        Array-like container of datetime-like values
    name : str
        Name of datetime field to access

    Returns
    -------
    array-like
        For in-memory input, the field values reshaped to ``values.shape``;
        for dask input, whatever `_get_date_field_from_dask` returns.

    """
    if not isinstance(values, dask_array_type):
        # `.dt` lives on a 1-D Series: flatten, read the field, then put
        # the original shape back.
        flat_series = pd.Series(values.ravel())
        flat_field = getattr(flat_series.dt, name).values
        return flat_field.reshape(values.shape)

    # Lazy dask input is delegated to the dask-specific helper.
    return _get_date_field_from_dask(values, name)
|
||
|
||
# NOTE(review): because this accessor is defined inside xarray itself, a
# reviewer suggested importing the class directly instead of registering it
# (the target module was not captured), to be more transparent about where
# it comes from.
@register_dataarray_accessor('dt')
class DatetimeAccessor(object):
    """Expose datetime fields of DataArrays with datetime-like dtypes.

    As in pandas, the fields of an applicable DataArray are reached through
    its `.dt` attribute:

       >>> ds = xarray.Dataset({'time': pd.date_range(start='2000/01/01',
       ...                                            freq='D', periods=100)})
       >>> ds.time.dt
       <xarray.core.accessors.DatetimeAccessor at 0x10c369f60>
       >>> ds.time.dt.dayofyear[:5]
       <xarray.DataArray 'dayofyear' (time: 5)>
       array([1, 2, 3, 4, 5], dtype=int32)
       Coordinates:
         * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 ...

    All of the pandas fields are accessible here. Note that these fields
    are not calendar-aware; if your datetimes are encoded with a
    non-Gregorian calendar (e.g. a 360-day calendar) using netcdftime, then
    some fields like `dayofyear` may not be accurate.

    """
    def __init__(self, xarray_obj):
        # Refuse anything whose dtype is not datetime-like up front.
        if not is_datetime_like(xarray_obj.dtype):
            raise TypeError("'dt' accessor only available for "
                            "DataArray with datetime64 or timedelta64 dtype")
        self._obj = xarray_obj
        # Underlying array (NumPy or dask) that the field accessors read.
        self._dt = xarray_obj.data

    # NOTE(review): not referenced anywhere else in this class; a reviewer
    # flagged this variable as no longer needed.
    _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second',
                  'weekofyear', 'week', 'weekday', 'dayofweek',
                  'dayofyear', 'quarter', 'days_in_month',
                  'daysinmonth', 'microsecond',
                  'nanosecond']

    def _tslib_field_accessor(name, docstring=None):
        # Build a read-only property that computes the field `name` and
        # wraps it in a DataArray sharing the parent's coords and dims.
        def getter(self):
            # Imported at call time rather than at module import time.
            from .dataarray import DataArray
            field_values = _get_date_field(self._dt, name)
            parent = self._obj
            return DataArray(field_values, name=name,
                             coords=parent.coords, dims=parent.dims)

        getter.__name__ = name
        getter.__doc__ = docstring
        return property(getter)

    year = _tslib_field_accessor('year', "The year of the datetime")
    month = _tslib_field_accessor(
        'month', "The month as January=1, December=12")
    day = _tslib_field_accessor('day', "The days of the datetime")
    hour = _tslib_field_accessor('hour', "The hours of the datetime")
    minute = _tslib_field_accessor('minute', "The minutes of the datetime")
    second = _tslib_field_accessor('second', "The seconds of the datetime")
    microsecond = _tslib_field_accessor(
        'microsecond', "The microseconds of the datetime")
    nanosecond = _tslib_field_accessor(
        'nanosecond', "The nanoseconds of the datetime")
    weekofyear = _tslib_field_accessor(
        'weekofyear', "The week ordinal of the year")
    # Alias: same property object under a second name.
    week = weekofyear
    dayofweek = _tslib_field_accessor(
        'dayofweek', "The day of the week with Monday=0, Sunday=6")
    weekday = dayofweek

    weekday_name = _tslib_field_accessor(
        'weekday_name', "The name of day in a week (ex: Friday)")

    dayofyear = _tslib_field_accessor(
        'dayofyear', "The ordinal day of the year")
    quarter = _tslib_field_accessor('quarter', "The quarter of the date")
    days_in_month = _tslib_field_accessor(
        'days_in_month', "The number of days in the month")
    daysinmonth = days_in_month
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
from __future__ import print_function | ||
try: | ||
import cPickle as pickle | ||
except ImportError: | ||
import pickle | ||
|
||
import xarray as xr | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from . import TestCase, requires_dask | ||
|
||
|
||
class TestDatetimeAccessor(TestCase):
    """Exercise the `.dt` accessor on NumPy- and dask-backed DataArrays."""

    def setUp(self):
        n_times = 100
        # Random payload is drawn before the random timestamps below.
        payload = np.random.rand(10, 10, n_times)
        self.times = pd.date_range(start="2000/01/01", freq='H',
                                   periods=n_times)
        # 3-D array of timestamps sampled from `self.times`.
        self.times_arr = np.random.choice(self.times,
                                          size=(10, 10, n_times))

        axes = [np.linspace(0, 11, 10), np.linspace(0, 20, 10), self.times]
        self.data = xr.DataArray(payload, coords=axes,
                                 dims=['lon', 'lat', 'time'], name='data')

    def test_field_access(self):
        fields = ('year', 'month', 'day', 'hour')
        # Reference values come straight from the pandas DatetimeIndex.
        expected = [xr.DataArray(getattr(self.times, field), name=field,
                                 coords=[self.times], dims=['time'])
                    for field in fields]

        for field, want in zip(fields, expected):
            self.assertDataArrayEqual(want,
                                      getattr(self.data.time.dt, field))

    def test_not_datetime_type(self):
        # Swap the time coordinate for plain integers; `.dt` must refuse.
        nontime_data = self.data.copy()
        n_steps = len(self.data.time)
        nontime_data['time'].values = np.arange(n_steps).astype('int8')
        with self.assertRaisesRegexp(TypeError, 'dt'):
            nontime_data.time.dt.year

    @requires_dask
    def test_dask_field_access(self):
        import dask.array as da

        fields = ('year', 'month', 'day', 'hour')

        # Safely pre-compute comparison fields by passing through Pandas
        # machinery.
        # NOTE(review): a reviewer suggested dropping this pandas round-trip
        # and relying on the NumPy-backed accessor instead, since other
        # tests already check it.
        flat_times = pd.Series(self.times_arr.ravel())

        def _reference(field):
            raw = getattr(flat_times.dt, field).values
            return xr.DataArray(raw.reshape(self.times_arr.shape),
                                coords=self.data.coords,
                                dims=self.data.dims, name=field)

        expected = [_reference(field) for field in fields]

        lazy_times = xr.DataArray(
            da.from_array(self.times_arr, chunks=(5, 5, 50)),
            coords=self.data.coords, dims=self.data.dims, name='data')
        lazy_fields = [getattr(lazy_times.dt, field) for field in fields]

        # Test that the data isn't eagerly evaluated
        for lazy in lazy_fields:
            assert isinstance(lazy.data, da.Array)

        # Double check that outcome chunksize is unchanged
        source_chunks = lazy_times.chunks
        for lazy in lazy_fields:
            self.assertEqual(lazy.data.chunks, source_chunks)

        # Check the actual output from the accessors
        for want, lazy in zip(expected, lazy_fields):
            self.assertDataArrayEqual(want, lazy.compute())
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is more complicated than it needs to be, as evidenced by the nested calls to `compute`. The great thing about `map_blocks` is that you can make everything eager, e.g., (code example not captured in this extract). That's it!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wow! The time spent on tackling this project is well worth it just for this tidbit alone. I'll circle back to clean up this stuff within the next day or so.