Skip to content

Commit 0668f34

Browse files
[backport 3.0.x] BUG: Fix HDFStore.put with StringDtype columns and compression (#64180) (#64569) (#64922)
Co-authored-by: Alex Lubbock <code@alexlubbock.com>
1 parent 23f2f44 commit 0668f34

File tree

3 files changed

+81
-64
lines changed

3 files changed

+81
-64
lines changed

doc/source/whatsnew/v3.0.2.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Bug fixes
3535
- Fixed bug where using :func:`col` with the Python functions :func:`bool`, :func:`iter`, :func:`copy`, and :func:`deepcopy` either failed or produced incorrect results; these now all raise a ``TypeError`` (:issue:`64267`)
3636
- Fixed bug in :func:`to_datetime` that could give an unnecessary ``RuntimeWarning`` when converting DataFrame containing missing values (:issue:`64141`)
3737
- Fixed bug in :meth:`Series.var` computing the variance of complex numbers incorrectly (:issue:`62421`)
38+
- Fixed bug in :meth:`~DataFrame.to_hdf` and :meth:`HDFStore.put` raising an error when writing :class:`StringDtype` columns in fixed format with compression enabled (:issue:`64180`)
3839
- Fixed bug in the :meth:`~Series.sum` method with python-backed string dtype returning incorrect value for an empty Series and ignoring the ``min_count`` argument (:issue:`64683`)
3940
- Fixed bug where :meth:`DataFrame.div` ignored the ``axis`` argument when used with ``level`` for MultiIndex columns (:issue:`64428`)
4041

pandas/io/pytables.py

Lines changed: 68 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3261,10 +3261,6 @@ def write_array_empty(self, key: str, value: ArrayLike) -> None:
32613261
def write_array(
32623262
self, key: str, obj: AnyArrayLike, items: Index | None = None
32633263
) -> None:
3264-
# TODO: we only have a few tests that get here, the only EA
3265-
# that gets passed is DatetimeArray, and we never have
3266-
# both self._filters and EA
3267-
32683264
value = extract_array(obj, extract_numpy=True)
32693265

32703266
if key in self.group:
@@ -3285,71 +3281,79 @@ def write_array(
32853281
value = value.T
32863282
transposed = True
32873283

3288-
atom = None
3289-
if self._filters is not None:
3290-
with suppress(ValueError):
3291-
# get the atom for this datatype
3292-
atom = _tables().Atom.from_dtype(value.dtype)
3293-
3294-
if atom is not None:
3295-
# We only get here if self._filters is non-None and
3296-
# the Atom.from_dtype call succeeded
3297-
3298-
# create an empty chunked array and fill it from value
3299-
if not empty_array:
3300-
ca = self._handle.create_carray(
3301-
self.group, key, atom, value.shape, filters=self._filters
3302-
)
3303-
ca[:] = value
3304-
3305-
else:
3306-
self.write_array_empty(key, value)
3307-
3308-
elif value.dtype.type == np.object_:
3309-
# infer the type, warn if we have a non-string type here (for
3310-
# performance)
3311-
inferred_type = lib.infer_dtype(value, skipna=False)
3312-
if empty_array:
3313-
pass
3314-
elif inferred_type == "string":
3315-
pass
3316-
elif get_option("performance_warnings"):
3317-
ws = performance_doc % (inferred_type, key, items)
3318-
warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3319-
3320-
vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3321-
vlarr.append(value)
3322-
3323-
elif lib.is_np_dtype(value.dtype, "M"):
3324-
self._handle.create_array(self.group, key, value.view("i8"))
3325-
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
3326-
elif isinstance(value.dtype, DatetimeTZDtype):
3327-
# store as UTC
3328-
# with a zone
3329-
3330-
# error: "ExtensionArray" has no attribute "asi8"
3331-
self._handle.create_array(
3332-
self.group,
3333-
key,
3334-
value.asi8, # type: ignore[attr-defined]
3284+
if isinstance(value, BaseStringArray):
3285+
# GH#64180: BaseStringArray must use the VLArray path.
3286+
# Atom.from_dtype does not handle ExtensionDtype.
3287+
vlarr = self._handle.create_vlarray(
3288+
self.group, key, _tables().ObjectAtom(), filters=self._filters
33353289
)
3336-
3337-
node = getattr(self.group, key)
3338-
# error: "ExtensionArray" has no attribute "tz"
3339-
node._v_attrs.tz = _get_tz(value.tz) # type: ignore[attr-defined]
3340-
node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
3341-
elif lib.is_np_dtype(value.dtype, "m"):
3342-
self._handle.create_array(self.group, key, value.view("i8"))
3343-
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
3344-
elif isinstance(value, BaseStringArray):
3345-
vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
33463290
vlarr.append(value.to_numpy())
33473291
node = getattr(self.group, key)
33483292
node._v_attrs.value_type = str(value.dtype)
3349-
elif empty_array:
3350-
self.write_array_empty(key, value)
3293+
33513294
else:
3352-
self._handle.create_array(self.group, key, value)
3295+
atom = None
3296+
if self._filters is not None:
3297+
with suppress(ValueError):
3298+
# get the atom for this datatype
3299+
atom = _tables().Atom.from_dtype(value.dtype)
3300+
3301+
if atom is not None:
3302+
# We only get here if self._filters is non-None and
3303+
# the Atom.from_dtype call succeeded
3304+
3305+
# create an empty chunked array and fill it from value
3306+
if not empty_array:
3307+
ca = self._handle.create_carray(
3308+
self.group, key, atom, value.shape, filters=self._filters
3309+
)
3310+
ca[:] = value
3311+
3312+
else:
3313+
self.write_array_empty(key, value)
3314+
3315+
elif value.dtype.type == np.object_:
3316+
# infer the type, warn if we have a non-string type here
3317+
# (for performance)
3318+
inferred_type = lib.infer_dtype(value, skipna=False)
3319+
if empty_array:
3320+
pass
3321+
elif inferred_type == "string":
3322+
pass
3323+
elif get_option("performance_warnings"):
3324+
ws = performance_doc % (inferred_type, key, items)
3325+
warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3326+
3327+
vlarr = self._handle.create_vlarray(
3328+
self.group, key, _tables().ObjectAtom()
3329+
)
3330+
vlarr.append(value)
3331+
3332+
elif lib.is_np_dtype(value.dtype, "M"):
3333+
self._handle.create_array(self.group, key, value.view("i8"))
3334+
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
3335+
elif isinstance(value.dtype, DatetimeTZDtype):
3336+
# store as UTC
3337+
# with a zone
3338+
3339+
# error: "ExtensionArray" has no attribute "asi8"
3340+
self._handle.create_array(
3341+
self.group,
3342+
key,
3343+
value.asi8, # type: ignore[attr-defined]
3344+
)
3345+
3346+
node = getattr(self.group, key)
3347+
# error: "ExtensionArray" has no attribute "tz"
3348+
node._v_attrs.tz = _get_tz(value.tz) # type: ignore[attr-defined]
3349+
node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
3350+
elif lib.is_np_dtype(value.dtype, "m"):
3351+
self._handle.create_array(self.group, key, value.view("i8"))
3352+
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
3353+
elif empty_array:
3354+
self.write_array_empty(key, value)
3355+
else:
3356+
self._handle.create_array(self.group, key, value)
33533357

33543358
getattr(self.group, key)._v_attrs.transposed = transposed
33553359

pandas/tests/io/pytables/test_put.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,18 @@ def test_put_str_series(temp_hdfstore, performance_warning, string_dtype_argumen
243243
tm.assert_series_equal(result, expected)
244244

245245

246+
def test_put_str_frame_complevel(temp_hdfstore, string_dtype_arguments):
247+
# GH#64180 - writing StringDtype columns to HDFStore with fixed format + complevel
248+
dtype = pd.StringDtype(*string_dtype_arguments)
249+
df = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype), "b": [1, 2, 3]})
250+
temp_hdfstore.put("df", df, complevel=1)
251+
expected_dtype = "str" if dtype.na_value is np.nan else "string"
252+
expected = df.copy()
253+
expected["a"] = expected["a"].astype(expected_dtype)
254+
result = temp_hdfstore.get("df")
255+
tm.assert_frame_equal(result, expected)
256+
257+
246258
@pytest.mark.parametrize("format", ["table", "fixed"])
247259
@pytest.mark.parametrize(
248260
"index",

0 commit comments

Comments
 (0)