Skip to content

Commit 182b6e1

Browse files
authored
Correctly handle zoneinfo timezones (#2253)
#### Reference Issues/PRs Fixes #2245 #### What does this implement or fix? Correctly normalizes zoneinfo timezones. Currently if we write something as a pytz or a ZoneInfo timezone we'll read it back as a `pytz` to preserve old pytz behavior and not break old arctic readers. [Consensus](pandas-dev/pandas#34916 (comment)) is that people generally prefer `ZoneInfo` and we can change to always using `ZoneInfo` but we'll need to do so in next major release. #### Any other comments? #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent 059a1a3 commit 182b6e1

File tree

5 files changed

+145
-13
lines changed

5 files changed

+145
-13
lines changed

python/arcticdb/version_store/_normalization.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
import copy
1010
import datetime
1111
import io
12+
import sys
13+
if sys.version_info >= (3, 9):
14+
import zoneinfo
1215
from datetime import timedelta
1316
import math
1417

@@ -326,7 +329,7 @@ def _to_tz_timestamp(dt):
326329
second=dt.second,
327330
microsecond=dt.microsecond,
328331
).value
329-
tz = dt.tzinfo.zone if dt.tzinfo is not None else None
332+
tz = _ensure_str_timezone(get_timezone(dt.tzinfo)) if dt.tzinfo is not None else None
330333
return [ts, tz]
331334

332335

@@ -400,14 +403,21 @@ def _normalize_single_index(
400403
return [str(name) for name in index_names], ix_vals
401404

402405

403-
def _ensure_str_timezone(index_tz):
404-
if isinstance(index_tz, datetime.tzinfo) and check_is_utc_if_newer_pandas(index_tz):
406+
def _ensure_str_timezone(tz):
407+
if sys.version_info >= (3, 9):
408+
if isinstance(tz, zoneinfo.ZoneInfo):
409+
# Pandas also special cases zoneinfo. It always just returns it as is instead of converting to string.
410+
# Since we also want to support zoneinfo timezones and we want old arcticdb clients to read them,
411+
# we convert them to a string (similarly to pytz timezones).
412+
return str(tz)
413+
414+
if isinstance(tz, datetime.tzinfo) and check_is_utc_if_newer_pandas(tz):
405415
# Pandas started to treat UTC as a special case and give back the tzinfo object for it. We coerce it back to
406416
# a str to avoid special cases for it further along our pipeline. The breaking change was:
407417
# https://github.com/jbrockmendel/pandas/commit/94ce05d1bcc3c99e992c48cc99d0fd2726f43102#diff-3dba9e959e6ad7c394f0662a0e6477593fca446a6924437701ecff82b0b20b55
408418
return "UTC"
409-
else:
410-
return index_tz
419+
420+
return tz
411421

412422

413423
def _denormalize_single_index(item, norm_meta):
@@ -1019,7 +1029,7 @@ def denormalize(self, obj, meta):
10191029

10201030
def _custom_pack(self, obj):
10211031
if isinstance(obj, pd.Timestamp):
1022-
tz = obj.tz.zone if obj.tz is not None else None
1032+
tz = _ensure_str_timezone(get_timezone(obj.tz)) if obj.tz is not None else None
10231033
return ExtType(MsgPackSerialization.PD_TIMESTAMP, packb([obj.value, tz]))
10241034

10251035
if isinstance(obj, datetime.datetime):
@@ -1036,6 +1046,9 @@ def _custom_pack(self, obj):
10361046
def _ext_hook(self, code, data):
10371047
if code == MsgPackSerialization.PD_TIMESTAMP:
10381048
data = unpackb(data, raw=False)
1049+
# TODO: Pandas by default interprets the `tz` string argument as `pytz`.
1050+
# We should instead use `ZoneInfo` but this will be an API break.
1051+
# See discussion here: https://github.com/pandas-dev/pandas/issues/34916#issuecomment-1595154742
10391052
return pd.Timestamp(data[0], tz=data[1]) if data[1] is not None else pd.Timestamp(data[0])
10401053

10411054
if code == MsgPackSerialization.PY_DATETIME:

python/tests/compat/arcticdb/test_compatibility.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
1+
import sys
12
import pytest
3+
import pytz
4+
if sys.version_info >= (3, 9):
5+
import zoneinfo
26
from packaging import version
37
import pandas as pd
48
import numpy as np
59
from arcticdb.util.test import assert_frame_equal
610
from arcticdb.options import ModifiableEnterpriseLibraryOption
711
from arcticdb.toolbox.library_tool import LibraryTool
8-
from tests.util.mark import ARCTICDB_USING_CONDA
12+
from tests.util.mark import ARCTICDB_USING_CONDA, ZONE_INFO_MARK
913
from arcticdb_ext.tools import StorageMover
1014

1115
from arcticdb.util.venv import CurrentVersion
@@ -187,6 +191,62 @@ def test_compat_snapshot_metadata_read(old_venv_and_arctic_uri, lib_name):
187191
assert meta == {"old_key": "old_value"}
188192

189193

194+
@ZONE_INFO_MARK
195+
@pytest.mark.parametrize("zone_name", ["UTC", "America/New_York"])
196+
def test_compat_timestamp_metadata(old_venv_and_arctic_uri, lib_name, zone_name):
197+
old_venv, arctic_uri = old_venv_and_arctic_uri
198+
199+
old_ac = old_venv.create_arctic(arctic_uri)
200+
old_lib = old_ac.create_library(lib_name)
201+
202+
sym = "sym"
203+
df = pd.DataFrame({"col": [1]})
204+
timestamp_no_tz = pd.Timestamp(2025, 1, 1)
205+
timestamp_pytz = pd.Timestamp(2025, 1, 1, tz=pytz.timezone(zone_name))
206+
timestamp_zoneinfo = pd.Timestamp(2025, 1, 1, tz=zoneinfo.ZoneInfo(zone_name))
207+
208+
# Write two versions with old arctic:
209+
# v0 - no timezone
210+
# v1 - pytz
211+
old_lib.execute([
212+
f"""
213+
import pytz
214+
timestamp_no_tz = pd.Timestamp(year=2025, month=1, day=1)
215+
timestamp_pytz = pd.Timestamp(year=2025, month=1, day=1, tz=pytz.timezone({repr(zone_name)}))
216+
lib.write({repr(sym)}, df, metadata=timestamp_no_tz)
217+
lib.write_metadata({repr(sym)}, timestamp_pytz)
218+
assert lib.read_metadata({repr(sym)}, as_of=0).metadata == timestamp_no_tz
219+
assert lib.read_metadata({repr(sym)}, as_of=1).metadata == timestamp_pytz
220+
"""
221+
], dfs={"df": df})
222+
223+
# Write 3 more versions with current arctic:
224+
# v2 - no timezone
225+
# v3 - pytz
226+
# v4 - zoneinfo
227+
with CurrentVersion(arctic_uri, lib_name) as curr:
228+
lib = curr.lib
229+
assert lib.read_metadata(sym, as_of=0).metadata == timestamp_no_tz
230+
assert lib.read_metadata(sym, as_of=1).metadata == timestamp_pytz
231+
lib.write_metadata(sym, timestamp_no_tz) # v2
232+
lib.write_metadata(sym, timestamp_pytz) # v3
233+
lib.write_metadata(sym, timestamp_zoneinfo) # v4
234+
assert lib.read_metadata(sym, as_of=2).metadata == timestamp_no_tz
235+
assert lib.read_metadata(sym, as_of=3).metadata == timestamp_pytz
236+
assert lib.read_metadata(sym, as_of=4).metadata == timestamp_pytz
237+
238+
old_lib.execute([
239+
f"""
240+
import pytz
241+
timestamp_no_tz = pd.Timestamp(year=2025, month=1, day=1)
242+
timestamp_pytz = pd.Timestamp(year=2025, month=1, day=1, tz=pytz.timezone({repr(zone_name)}))
243+
assert lib.read_metadata({repr(sym)}, as_of=2).metadata == timestamp_no_tz
244+
assert lib.read_metadata({repr(sym)}, as_of=3).metadata == timestamp_pytz
245+
assert lib.read_metadata({repr(sym)}, as_of=4).metadata == timestamp_pytz
246+
"""
247+
])
248+
249+
190250
def test_compat_read_incomplete(old_venv_and_arctic_uri, lib_name):
191251
old_venv, arctic_uri = old_venv_and_arctic_uri
192252
sym = "sym"

python/tests/integration/arcticdb/version_store/test_metadata_support.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,15 @@
55
66
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
77
"""
8+
import sys
9+
810
import numpy as np
911
import pandas as pd
12+
import datetime
13+
if sys.version_info >= (3, 9):
14+
import zoneinfo
15+
import pytz
16+
1017
from arcticdb_ext import set_config_string, unset_config_string
1118
from pandas import DataFrame, Timestamp
1219
import pytest
@@ -16,6 +23,8 @@
1623
from arcticdb_ext.storage import NoDataFoundException
1724
from arcticdb.util.test import assert_frame_equal, distinct_timestamps
1825

26+
from tests.util.mark import ZONE_INFO_MARK
27+
1928

2029
# In the following lines, the naming convention is
2130
# test_rt_df stands for roundtrip dataframe (implicitly pandas given file name)
@@ -283,3 +292,30 @@ def test_rv_contains_metadata_batch_write_metadata(lmdb_version_store_v1):
283292
vits = lib.batch_write_metadata([sym_0, sym_1], [metadata_0, metadata_1])
284293
assert vits[0].metadata == metadata_0
285294
assert vits[1].metadata == metadata_1
295+
296+
297+
@ZONE_INFO_MARK
298+
@pytest.mark.parametrize("zone_type", ["pytz", "zoneinfo"])
299+
@pytest.mark.parametrize("zone_name", ["UTC", "America/Los_Angeles", "Europe/London", "Asia/Tokyo", "Pacific/Kiritimati"])
300+
def test_metadata_timestamp_with_tz(lmdb_version_store_v1, zone_type, zone_name):
301+
lib = lmdb_version_store_v1
302+
sym = "sym"
303+
df = timestamp_indexed_df()
304+
if zone_type == "pytz":
305+
zone_to_write = pytz.timezone(zone_name)
306+
elif zone_type == "zoneinfo":
307+
zone_to_write = zoneinfo.ZoneInfo(zone_name)
308+
else:
309+
raise ValueError("Unknown timezone type")
310+
# We need to normalize all timezoned timestamps to a pd.Timestamp with pytz
311+
expected_metadata = pd.Timestamp(2025, 1, 1, tz=pytz.timezone(zone_name))
312+
313+
# pd.Timestamp
314+
metadata_to_write = pd.Timestamp(2025, 1, 1, tz=zone_to_write)
315+
lib.write(sym, df, metadata_to_write)
316+
assert lib.read(sym).metadata == expected_metadata
317+
318+
# datetime.datetime
319+
metadata_to_write = datetime.datetime(2025, 1, 1, tzinfo=zone_to_write)
320+
lib.write(sym, df, metadata_to_write)
321+
assert lib.read(sym).metadata == expected_metadata

python/tests/unit/arcticdb/version_store/test_normalization.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,14 @@
1212
import sys
1313
from collections import namedtuple
1414
from unittest.mock import patch, MagicMock
15+
1516
import numpy as np
1617
import pandas as pd
1718
import dateutil as du
1819
import pytest
1920
import pytz
21+
if sys.version_info >= (3, 9):
22+
import zoneinfo
2023
from numpy.testing import assert_equal, assert_array_equal
2124
from arcticdb_ext.version_store import SortedValue as _SortedValue
2225

@@ -49,7 +52,7 @@
4952
from arcticdb.util._versions import IS_PANDAS_ZERO, IS_PANDAS_TWO
5053
from arcticdb.exceptions import ArcticNativeException
5154

52-
from tests.util.mark import param_dict
55+
from tests.util.mark import param_dict, ZONE_INFO_MARK
5356

5457
params = {
5558
"simple_dict": {"a": "1", "b": 2, "c": 3.0, "d": True},
@@ -238,19 +241,35 @@ def test_empty_df():
238241
assert_frame_equal(d, D)
239242

240243

244+
timezone_params = [
245+
"UTC",
246+
"Europe/Amsterdam",
247+
pytz.UTC,
248+
pytz.timezone("Europe/Amsterdam"),
249+
du.tz.gettz("UTC"),
250+
]
251+
if sys.version_info > (3, 9):
252+
timezone_params += [
253+
zoneinfo.ZoneInfo("UTC"),
254+
zoneinfo.ZoneInfo("Pacific/Kiritimati"),
255+
zoneinfo.ZoneInfo("America/Los_Angeles"),
256+
]
241257
# See test_get_description_date_range_tz in test_arctic.py for the V2 API equivalent
242-
@pytest.mark.parametrize(
243-
"tz", ["UTC", "Europe/Amsterdam", pytz.UTC, pytz.timezone("Europe/Amsterdam"), du.tz.gettz("UTC")]
244-
)
258+
@pytest.mark.parametrize("tz", timezone_params)
245259
def test_write_tz(lmdb_version_store, sym, tz):
246260
assert tz is not None
247-
index = index=pd.date_range(pd.Timestamp(0), periods=10, tz=tz)
261+
index = pd.date_range(pd.Timestamp(0), periods=10, tz=tz)
248262
df = pd.DataFrame(data={"col1": np.arange(10)}, index=index)
249263
lmdb_version_store.write(sym, df)
250264
result = lmdb_version_store.read(sym).data
251-
assert_frame_equal(df, result)
252265
df_tz = df.index.tzinfo
253266
assert str(df_tz) == str(tz)
267+
expected_df = df
268+
if sys.version_info >= (3, 9):
269+
if isinstance(tz, zoneinfo.ZoneInfo):
270+
# We convert all timezones to pytz, so we expect to convert to a pytz timezone.
271+
expected_df.index = expected_df.index.tz_convert(pytz.timezone(str(tz)))
272+
assert_frame_equal(expected_df, result)
254273
if tz == du.tz.gettz("UTC") and sys.version_info < (3, 7):
255274
pytest.skip("Timezone files don't seem to have ever worked properly on Python 3.6")
256275
start_ts, end_ts = lmdb_version_store.get_timerange_for_symbol(sym)

python/tests/util/mark.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@
7070
not MEMRAY_SUPPORTED, reason="MEMRAY supports linux and macos and python 3.8 and above"
7171
)
7272

73+
ZONE_INFO_MARK = pytest.mark.skipif(
74+
sys.version_info < (3, 9),
75+
reason="zoneinfo module was introduced in Python 3.9")
76+
7377
SSL_TESTS_MARK = pytest.mark.skipif(
7478
not SSL_TEST_SUPPORTED,
7579
reason="When SSL tests are enabled",

0 commit comments

Comments
 (0)