Skip to content

Commit 053e7be

Browse files
authored
fix #788: convert unicode strings to bytes when dumping Axis and Group objects to HDF files, and decode them back to unicode when loading (to avoid huge HDF files)
1 parent ecbb3c7 commit 053e7be

File tree

6 files changed

+33
-4
lines changed

6 files changed

+33
-4
lines changed

larray/core/axis.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1349,10 +1349,13 @@ def to_hdf(self, filepath, key=None):
13491349
raise ValueError("Argument key must be provided explicitly in case of anonymous axis")
13501350
key = self.name
13511351
key = _translate_group_key_hdf(key)
1352-
s = pd.Series(data=self.labels, name=self.name)
1352+
dtype_kind = self.labels.dtype.kind
1353+
data = np.char.encode(self.labels, 'utf-8') if dtype_kind == 'U' else self.labels
1354+
s = pd.Series(data=data, name=self.name)
13531355
with LHDFStore(filepath) as store:
13541356
store.put(key, s)
13551357
store.get_storer(key).attrs.type = 'Axis'
1358+
store.get_storer(key).attrs.dtype_kind = dtype_kind
13561359
store.get_storer(key).attrs.wildcard = self.iswildcard
13571360

13581361
@property

larray/core/group.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -1462,10 +1462,15 @@ def to_hdf(self, filepath, key=None, axis_key=None):
14621462
if self.axis.name is None:
14631463
raise ValueError("Argument axis_key must be provided explicitly if the associated axis is anonymous")
14641464
axis_key = self.axis.name
1465-
s = pd.Series(data=self.eval(), name=self.name)
1465+
data = self.eval()
1466+
dtype_kind = data.dtype.kind if isinstance(data, np.ndarray) else ''
1467+
if dtype_kind == 'U':
1468+
data = np.char.encode(data, 'utf-8')
1469+
s = pd.Series(data=data, name=self.name)
14661470
with LHDFStore(filepath) as store:
14671471
store.put(key, s)
14681472
store.get_storer(key).attrs.type = 'Group'
1473+
store.get_storer(key).attrs.dtype_kind = dtype_kind
14691474
if axis_key not in store:
14701475
self.axis.to_hdf(store, key=axis_key)
14711476
store.get_storer(key).attrs.axis_key = axis_key

larray/inout/hdf.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -88,15 +88,21 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s
8888
name = str(pd_obj.name)
8989
if name == 'None':
9090
name = None
91-
res = Axis(labels=pd_obj.values, name=name)
91+
labels = pd_obj.values
92+
if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
93+
labels = np.char.decode(labels, 'utf-8')
94+
res = Axis(labels=labels, name=name)
9295
res._iswildcard = attrs['wildcard']
9396
elif _type == 'Group':
9497
if name is None:
9598
name = str(pd_obj.name)
9699
if name == 'None':
97100
name = None
101+
key = pd_obj.values
102+
if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
103+
key = np.char.decode(key, 'utf-8')
98104
axis = read_hdf(filepath_or_buffer, attrs['axis_key'])
99-
res = LGroup(key=pd_obj.values, name=name, axis=axis)
105+
res = LGroup(key=key, name=name, axis=axis)
100106
return res
101107

102108

larray/tests/test_array.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# -*- coding: utf8 -*-
12
from __future__ import absolute_import, division, print_function
23

34
import os

larray/tests/test_axis.py

+7
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
# -*- coding: utf8 -*-
12
from __future__ import absolute_import, division, print_function
3+
24
import pytest
35
import os.path
46
import numpy as np
@@ -391,6 +393,7 @@ def test_h5_io(tmpdir):
391393
lipro = Axis('lipro=P01..P05')
392394
anonymous = Axis(range(3))
393395
wildcard = Axis(3, 'wildcard')
396+
string_axis = Axis(['@!àéè&%µ$~', '/*-+_§()><', 'another label'], 'string_axis')
394397
fpath = os.path.join(str(tmpdir), 'axes.h5')
395398

396399
# ---- default behavior ----
@@ -410,6 +413,10 @@ def test_h5_io(tmpdir):
410413
wildcard2 = read_hdf(fpath, key=wildcard.name)
411414
assert wildcard2.iswildcard
412415
assert wildcard.equals(wildcard2)
416+
# string axis
417+
string_axis.to_hdf(fpath)
418+
string_axis2 = read_hdf(fpath, string_axis.name)
419+
assert string_axis.equals(string_axis2)
413420

414421
# ---- specific key ----
415422
# int axis

larray/tests/test_group.py

+7
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
# -*- coding: utf8 -*-
12
from __future__ import absolute_import, division, print_function
3+
24
import pytest
35
import os.path
46
import numpy as np
@@ -192,6 +194,7 @@ def test_h5_io_lgroup(tmpdir):
192194
named_axis_not_in_file = lipro['P01,P03,P05'] >> 'P_odd'
193195
anonymous = age[':5']
194196
wildcard = age_wildcard[':5'] >> 'age_w_05'
197+
string_group = Axis(['@!àéè&%µ$~', '/*-+_§()><', 'another label'], 'string_axis')[:] >> 'string_group'
195198

196199
# ---- default behavior ----
197200
# named group
@@ -209,6 +212,10 @@ def test_h5_io_lgroup(tmpdir):
209212
named_axis_not_in_file.to_hdf(fpath)
210213
named2 = read_hdf(fpath, key=named_axis_not_in_file.name)
211214
assert all(named_axis_not_in_file == named2)
215+
# string group
216+
string_group.to_hdf(fpath)
217+
string_group2 = read_hdf(fpath, key=string_group.name)
218+
assert all(string_group == string_group2)
212219

213220
# ---- specific hdf group + key ----
214221
hdf_group = 'my_groups'

0 commit comments

Comments
 (0)