Skip to content

Commit 10f0227

Browse files
Ensure maximum accuracy when encoding and decoding cftime.datetime values (#4758)
1 parent aba46cc commit 10f0227

File tree

6 files changed

+155
-32
lines changed

6 files changed

+155
-32
lines changed

doc/whats-new.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ Deprecations
5454

5555
New Features
5656
~~~~~~~~~~~~
57+
- Xarray now leverages updates as of cftime version 1.4.1, which enable exact I/O
58+
roundtripping of ``cftime.datetime`` objects (:pull:`4758`).
59+
By `Spencer Clark <https://github.com/spencerkclark>`_.
60+
- :py:meth:`~xarray.cftime_range` and :py:meth:`DataArray.resample` now support
61+
millisecond (``"L"`` or ``"ms"``) and microsecond (``"U"`` or ``"us"``) frequencies
62+
for ``cftime.datetime`` coordinates (:issue:`4097`, :pull:`4758`).
63+
By `Spencer Clark <https://github.com/spencerkclark>`_.
5764
- Significantly higher ``unstack`` performance on numpy-backed arrays which
5865
contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
5966
(:pull:`4746`);

xarray/coding/cftime_offsets.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -576,6 +576,26 @@ def __apply__(self, other):
576576
return other + self.as_timedelta()
577577

578578

579+
class Millisecond(BaseCFTimeOffset):
580+
_freq = "L"
581+
582+
def as_timedelta(self):
583+
return timedelta(milliseconds=self.n)
584+
585+
def __apply__(self, other):
586+
return other + self.as_timedelta()
587+
588+
589+
class Microsecond(BaseCFTimeOffset):
590+
_freq = "U"
591+
592+
def as_timedelta(self):
593+
return timedelta(microseconds=self.n)
594+
595+
def __apply__(self, other):
596+
return other + self.as_timedelta()
597+
598+
579599
_FREQUENCIES = {
580600
"A": YearEnd,
581601
"AS": YearBegin,
@@ -590,6 +610,10 @@ def __apply__(self, other):
590610
"T": Minute,
591611
"min": Minute,
592612
"S": Second,
613+
"L": Millisecond,
614+
"ms": Millisecond,
615+
"U": Microsecond,
616+
"us": Microsecond,
593617
"AS-JAN": partial(YearBegin, month=1),
594618
"AS-FEB": partial(YearBegin, month=2),
595619
"AS-MAR": partial(YearBegin, month=3),
@@ -824,7 +848,7 @@ def cftime_range(
824848
`ISO-8601 format <https://en.wikipedia.org/wiki/ISO_8601>`_.
825849
- It supports many, but not all, frequencies supported by
826850
``pandas.date_range``. For example it does not currently support any of
827-
the business-related, semi-monthly, or sub-second frequencies.
851+
the business-related or semi-monthly frequencies.
828852
- Compound sub-monthly frequencies are not supported, e.g. '1H1min', as
829853
these can easily be written in terms of the finest common resolution,
830854
e.g. '61min'.
@@ -855,6 +879,10 @@ def cftime_range(
855879
+--------+--------------------------+
856880
| S | Second frequency |
857881
+--------+--------------------------+
882+
| L, ms | Millisecond frequency |
883+
+--------+--------------------------+
884+
| U, us | Microsecond frequency |
885+
+--------+--------------------------+
858886
859887
Any multiples of the following anchored offsets are also supported.
860888

xarray/coding/times.py

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import re
22
import warnings
3-
from datetime import datetime
3+
from datetime import datetime, timedelta
44
from distutils.version import LooseVersion
55
from functools import partial
66

@@ -35,6 +35,26 @@
3535
"D": int(1e9) * 60 * 60 * 24,
3636
}
3737

38+
_US_PER_TIME_DELTA = {
39+
"microseconds": 1,
40+
"milliseconds": 1_000,
41+
"seconds": 1_000_000,
42+
"minutes": 60 * 1_000_000,
43+
"hours": 60 * 60 * 1_000_000,
44+
"days": 24 * 60 * 60 * 1_000_000,
45+
}
46+
47+
_NETCDF_TIME_UNITS_CFTIME = [
48+
"days",
49+
"hours",
50+
"minutes",
51+
"seconds",
52+
"milliseconds",
53+
"microseconds",
54+
]
55+
56+
_NETCDF_TIME_UNITS_NUMPY = _NETCDF_TIME_UNITS_CFTIME + ["nanoseconds"]
57+
3858
TIME_UNITS = frozenset(
3959
[
4060
"days",
@@ -225,9 +245,7 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
225245
if calendar in _STANDARD_CALENDARS:
226246
dates = cftime_to_nptime(dates)
227247
elif use_cftime:
228-
dates = _decode_datetime_with_cftime(
229-
flat_num_dates.astype(float), units, calendar
230-
)
248+
dates = _decode_datetime_with_cftime(flat_num_dates, units, calendar)
231249
else:
232250
dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)
233251

@@ -262,25 +280,33 @@ def decode_cf_timedelta(num_timedeltas, units):
262280
return result.reshape(num_timedeltas.shape)
263281

264282

283+
def _unit_timedelta_cftime(units):
284+
return timedelta(microseconds=_US_PER_TIME_DELTA[units])
285+
286+
287+
def _unit_timedelta_numpy(units):
288+
numpy_units = _netcdf_to_numpy_timeunit(units)
289+
return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns")
290+
291+
265292
def _infer_time_units_from_diff(unique_timedeltas):
266-
# Note that the modulus operator was only implemented for np.timedelta64
267-
# arrays as of NumPy version 1.16.0. Once our minimum version of NumPy
268-
# supported is greater than or equal to this we will no longer need to cast
269-
# unique_timedeltas to a TimedeltaIndex. In the meantime, however, the
270-
# modulus operator works for TimedeltaIndex objects.
271-
unique_deltas_as_index = pd.TimedeltaIndex(unique_timedeltas)
272-
for time_unit in [
273-
"days",
274-
"hours",
275-
"minutes",
276-
"seconds",
277-
"milliseconds",
278-
"microseconds",
279-
"nanoseconds",
280-
]:
281-
delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)]
282-
unit_delta = np.timedelta64(delta_ns, "ns")
283-
if np.all(unique_deltas_as_index % unit_delta == np.timedelta64(0, "ns")):
293+
if unique_timedeltas.dtype == np.dtype("O"):
294+
time_units = _NETCDF_TIME_UNITS_CFTIME
295+
unit_timedelta = _unit_timedelta_cftime
296+
zero_timedelta = timedelta(microseconds=0)
297+
timedeltas = unique_timedeltas
298+
else:
299+
time_units = _NETCDF_TIME_UNITS_NUMPY
300+
unit_timedelta = _unit_timedelta_numpy
301+
zero_timedelta = np.timedelta64(0, "ns")
302+
# Note that the modulus operator was only implemented for np.timedelta64
303+
# arrays as of NumPy version 1.16.0. Once our minimum version of NumPy
304+
# supported is greater than or equal to this we will no longer need to cast
305+
# unique_timedeltas to a TimedeltaIndex. In the meantime, however, the
306+
# modulus operator works for TimedeltaIndex objects.
307+
timedeltas = pd.TimedeltaIndex(unique_timedeltas)
308+
for time_unit in time_units:
309+
if np.all(timedeltas % unit_timedelta(time_unit) == zero_timedelta):
284310
return time_unit
285311
return "seconds"
286312

@@ -309,10 +335,6 @@ def infer_datetime_units(dates):
309335
reference_date = dates[0] if len(dates) > 0 else "1970-01-01"
310336
reference_date = format_cftime_datetime(reference_date)
311337
unique_timedeltas = np.unique(np.diff(dates))
312-
if unique_timedeltas.dtype == np.dtype("O"):
313-
# Convert to np.timedelta64 objects using pandas to work around a
314-
# NumPy casting bug: https://github.com/numpy/numpy/issues/11096
315-
unique_timedeltas = to_timedelta_unboxed(unique_timedeltas)
316338
units = _infer_time_units_from_diff(unique_timedeltas)
317339
return f"{units} since {reference_date}"
318340

xarray/tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def LooseVersion(vstring):
6868
has_pseudonetcdf, requires_pseudonetcdf = _importorskip("PseudoNetCDF")
6969
has_cftime, requires_cftime = _importorskip("cftime")
7070
has_cftime_1_1_0, requires_cftime_1_1_0 = _importorskip("cftime", minversion="1.1.0.0")
71+
has_cftime_1_4_1, requires_cftime_1_4_1 = _importorskip("cftime", minversion="1.4.1")
7172
has_dask, requires_dask = _importorskip("dask")
7273
has_bottleneck, requires_bottleneck = _importorskip("bottleneck")
7374
has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis")

xarray/tests/test_cftime_offsets.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
BaseCFTimeOffset,
1111
Day,
1212
Hour,
13+
Microsecond,
14+
Millisecond,
1315
Minute,
1416
MonthBegin,
1517
MonthEnd,
@@ -181,6 +183,14 @@ def test_to_offset_offset_input(offset):
181183
("2min", Minute(n=2)),
182184
("S", Second()),
183185
("2S", Second(n=2)),
186+
("L", Millisecond(n=1)),
187+
("2L", Millisecond(n=2)),
188+
("ms", Millisecond(n=1)),
189+
("2ms", Millisecond(n=2)),
190+
("U", Microsecond(n=1)),
191+
("2U", Microsecond(n=2)),
192+
("us", Microsecond(n=1)),
193+
("2us", Microsecond(n=2)),
184194
],
185195
ids=_id_func,
186196
)
@@ -299,6 +309,8 @@ def test_to_cftime_datetime_error_type_error():
299309
Hour(),
300310
Minute(),
301311
Second(),
312+
Millisecond(),
313+
Microsecond(),
302314
]
303315
_EQ_TESTS_B = [
304316
BaseCFTimeOffset(n=2),
@@ -316,6 +328,8 @@ def test_to_cftime_datetime_error_type_error():
316328
Hour(n=2),
317329
Minute(n=2),
318330
Second(n=2),
331+
Millisecond(n=2),
332+
Microsecond(n=2),
319333
]
320334

321335

@@ -340,6 +354,8 @@ def test_neq(a, b):
340354
Hour(n=2),
341355
Minute(n=2),
342356
Second(n=2),
357+
Millisecond(n=2),
358+
Microsecond(n=2),
343359
]
344360

345361

@@ -360,6 +376,8 @@ def test_eq(a, b):
360376
(Hour(), Hour(n=3)),
361377
(Minute(), Minute(n=3)),
362378
(Second(), Second(n=3)),
379+
(Millisecond(), Millisecond(n=3)),
380+
(Microsecond(), Microsecond(n=3)),
363381
]
364382

365383

@@ -387,6 +405,8 @@ def test_rmul(offset, expected):
387405
(Hour(), Hour(n=-1)),
388406
(Minute(), Minute(n=-1)),
389407
(Second(), Second(n=-1)),
408+
(Millisecond(), Millisecond(n=-1)),
409+
(Microsecond(), Microsecond(n=-1)),
390410
],
391411
ids=_id_func,
392412
)
@@ -399,6 +419,8 @@ def test_neg(offset, expected):
399419
(Hour(n=2), (1, 1, 1, 2)),
400420
(Minute(n=2), (1, 1, 1, 0, 2)),
401421
(Second(n=2), (1, 1, 1, 0, 0, 2)),
422+
(Millisecond(n=2), (1, 1, 1, 0, 0, 0, 2000)),
423+
(Microsecond(n=2), (1, 1, 1, 0, 0, 0, 2)),
402424
]
403425

404426

@@ -427,6 +449,8 @@ def test_radd_sub_monthly(offset, expected_date_args, calendar):
427449
(Hour(n=2), (1, 1, 2, 22)),
428450
(Minute(n=2), (1, 1, 2, 23, 58)),
429451
(Second(n=2), (1, 1, 2, 23, 59, 58)),
452+
(Millisecond(n=2), (1, 1, 2, 23, 59, 59, 998000)),
453+
(Microsecond(n=2), (1, 1, 2, 23, 59, 59, 999998)),
430454
],
431455
ids=_id_func,
432456
)
@@ -802,6 +826,8 @@ def test_add_quarter_end_onOffset(
802826
((1, 1, 1), Hour(), True),
803827
((1, 1, 1), Minute(), True),
804828
((1, 1, 1), Second(), True),
829+
((1, 1, 1), Millisecond(), True),
830+
((1, 1, 1), Microsecond(), True),
805831
],
806832
ids=_id_func,
807833
)
@@ -865,6 +891,8 @@ def test_onOffset_month_or_quarter_or_year_end(
865891
(Hour(), (1, 3, 2, 1, 1), (1, 3, 2, 1, 1)),
866892
(Minute(), (1, 3, 2, 1, 1, 1), (1, 3, 2, 1, 1, 1)),
867893
(Second(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
894+
(Millisecond(), (1, 3, 2, 1, 1, 1, 1000), (1, 3, 2, 1, 1, 1, 1000)),
895+
(Microsecond(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
868896
],
869897
ids=_id_func,
870898
)
@@ -914,6 +942,8 @@ def test_rollforward(calendar, offset, initial_date_args, partial_expected_date_
914942
(Hour(), (1, 3, 2, 1, 1), (1, 3, 2, 1, 1)),
915943
(Minute(), (1, 3, 2, 1, 1, 1), (1, 3, 2, 1, 1, 1)),
916944
(Second(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
945+
(Millisecond(), (1, 3, 2, 1, 1, 1, 1000), (1, 3, 2, 1, 1, 1, 1000)),
946+
(Microsecond(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
917947
],
918948
ids=_id_func,
919949
)

xarray/tests/test_coding_times.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,21 @@
11
import warnings
2+
from datetime import timedelta
23
from itertools import product
34

45
import numpy as np
56
import pandas as pd
67
import pytest
78
from pandas.errors import OutOfBoundsDatetime
89

9-
from xarray import DataArray, Dataset, Variable, coding, conventions, decode_cf
10+
from xarray import (
11+
DataArray,
12+
Dataset,
13+
Variable,
14+
cftime_range,
15+
coding,
16+
conventions,
17+
decode_cf,
18+
)
1019
from xarray.coding.times import (
1120
_encode_datetime_with_cftime,
1221
cftime_to_nptime,
@@ -19,7 +28,15 @@
1928
from xarray.core.common import contains_cftime_datetimes
2029
from xarray.testing import assert_equal
2130

22-
from . import arm_xfail, assert_array_equal, has_cftime, requires_cftime, requires_dask
31+
from . import (
32+
arm_xfail,
33+
assert_array_equal,
34+
has_cftime,
35+
has_cftime_1_4_1,
36+
requires_cftime,
37+
requires_cftime_1_4_1,
38+
requires_dask,
39+
)
2340

2441
_NON_STANDARD_CALENDARS_SET = {
2542
"noleap",
@@ -973,8 +990,13 @@ def test_decode_ambiguous_time_warns(calendar):
973990

974991
@pytest.mark.parametrize("encoding_units", FREQUENCIES_TO_ENCODING_UNITS.values())
975992
@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
976-
def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):
977-
times = pd.date_range("2000", periods=3, freq=freq)
993+
@pytest.mark.parametrize("date_range", [pd.date_range, cftime_range])
994+
def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq, date_range):
995+
if not has_cftime_1_4_1 and date_range == cftime_range:
996+
pytest.skip("Test requires cftime 1.4.1.")
997+
if (freq == "N" or encoding_units == "nanoseconds") and date_range == cftime_range:
998+
pytest.skip("Nanosecond frequency is not valid for cftime dates.")
999+
times = date_range("2000", periods=3, freq=freq)
9781000
units = f"{encoding_units} since 2000-01-01"
9791001
encoded, _, _ = coding.times.encode_cf_datetime(times, units)
9801002

@@ -987,7 +1009,7 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):
9871009

9881010

9891011
@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
990-
def test_encode_decode_roundtrip(freq):
1012+
def test_encode_decode_roundtrip_datetime64(freq):
9911013
# See GH 4045. Prior to GH 4684 this test would fail for frequencies of
9921014
# "S", "L", "U", and "N".
9931015
initial_time = pd.date_range("1678-01-01", periods=1)
@@ -998,6 +1020,19 @@ def test_encode_decode_roundtrip(freq):
9981020
assert_equal(variable, decoded)
9991021

10001022

1023+
@requires_cftime_1_4_1
1024+
@pytest.mark.parametrize("freq", ["U", "L", "S", "T", "H", "D"])
1025+
def test_encode_decode_roundtrip_cftime(freq):
1026+
initial_time = cftime_range("0001", periods=1)
1027+
times = initial_time.append(
1028+
cftime_range("0001", periods=2, freq=freq) + timedelta(days=291000 * 365)
1029+
)
1030+
variable = Variable(["time"], times)
1031+
encoded = conventions.encode_cf_variable(variable)
1032+
decoded = conventions.decode_cf_variable("time", encoded, use_cftime=True)
1033+
assert_equal(variable, decoded)
1034+
1035+
10011036
@requires_cftime
10021037
def test__encode_datetime_with_cftime():
10031038
# See GH 4870. cftime versions > 1.4.0 required us to adapt the

0 commit comments

Comments
 (0)