-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Make pd.read_hdf('data.h5') work when pandas object stored contained categorical columns #13359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
02f90d5
b3a5773
2f41aef
df10016
e7c8313
611aa28
e839638
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -331,11 +331,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): | |
|
||
try: | ||
if key is None: | ||
keys = store.keys() | ||
if len(keys) != 1: | ||
raise ValueError('key must be provided when HDF file contains ' | ||
'multiple datasets.') | ||
key = keys[0] | ||
groups = store.groups() | ||
if len(groups) == 0: | ||
raise ValueError('No dataset in HDF file.') | ||
candidate_only_group = groups[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this will fail on an empty store (and the error it raises will be odd, e.g. |
||
|
||
# For the HDF file to have only one dataset, all other groups | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line |
||
# should then be metadata groups for that candidate group. (This | ||
# assumes that the groups() method enumerates parent groups | ||
# before their children.) | ||
for group_to_check in groups[1:]: | ||
if not _is_metadata_of(group_to_check, candidate_only_group): | ||
raise ValueError('key must be provided when HDF file ' | ||
'contains multiple datasets.') | ||
key = candidate_only_group._v_pathname | ||
return store.select(key, auto_close=auto_close, **kwargs) | ||
except: | ||
# if there is an error, close the store | ||
|
@@ -347,6 +356,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): | |
raise | ||
|
||
|
||
def _is_metadata_of(group, parent_group): | ||
"""Check if a given group is a metadata group for a given parent_group.""" | ||
if group._v_depth <= parent_group._v_depth: | ||
return False | ||
|
||
current = group | ||
while current._v_depth > 1: | ||
parent = current._v_parent | ||
if parent == parent_group and current._v_name == 'meta': | ||
return True | ||
current = current._v_parent | ||
return False | ||
|
||
|
||
class HDFStore(StringMixin): | ||
|
||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,8 +46,8 @@ | |
|
||
from distutils.version import LooseVersion | ||
|
||
_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ | ||
and 'blosc' or 'zlib' | ||
_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' | ||
else 'zlib') | ||
|
||
_multiprocess_can_split_ = False | ||
|
||
|
@@ -4877,13 +4877,34 @@ def test_read_nokey(self): | |
df = DataFrame(np.random.rand(4, 5), | ||
index=list('abcd'), | ||
columns=list('ABCDE')) | ||
|
||
# Categorical dtype not supported for "fixed" format. So no need | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line |
||
# to test with that dtype in the dataframe here. | ||
with ensure_clean_path(self.path) as path: | ||
df.to_hdf(path, 'df', mode='a') | ||
reread = read_hdf(path) | ||
assert_frame_equal(df, reread) | ||
df.to_hdf(path, 'df2', mode='a') | ||
self.assertRaises(ValueError, read_hdf, path) | ||
|
||
def test_read_nokey_table(self): | ||
# GH13231 | ||
df = DataFrame({'i': range(5), | ||
'c': Series(list('abacd'), dtype='category')}) | ||
|
||
with ensure_clean_path(self.path) as path: | ||
df.to_hdf(path, 'df', mode='a', format='table') | ||
reread = read_hdf(path) | ||
assert_frame_equal(df, reread) | ||
df.to_hdf(path, 'df2', mode='a', format='table') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you really mean "add a blank line before the last assert (assertRaises) of test_read_nokey_table()?" Because then that with-block would not look the same as the pretty much identical one in the very similar test_read_nokey(). I'm assuming for consistency's sake that you meant a blank like before the with-block (like for test_read_nokey()). If not, please let me know. |
||
self.assertRaises(ValueError, read_hdf, path) | ||
|
||
def test_read_nokey_empty(self): | ||
with ensure_clean_path(self.path) as path: | ||
store = HDFStore(path) | ||
store.close() | ||
self.assertRaises(ValueError, read_hdf, path) | ||
|
||
def test_read_from_pathlib_path(self): | ||
|
||
# GH11773 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure that
ValueError
is the right exception here, but at least it's the same type as the exception raised by 0.18.1. (Although the message for 0.18.1 in this case is "key must be provided when HDF file contains multiple datasets.", which is a bit confusing.) And by the way, the exception raised when trying to dopd.read_hdf('empty.h5', 'some_key')
is (sensibly) "KeyError: 'No object named some_key in the file'". But raising KeyError for the case where key=None and we are trying to automatically figure out the (single, valid) key in the file seems wrong to me. Let me know if you can think of a better exception (or set of exceptions) for this.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is ok. The user is explicity trying to do something which doesn't work. Its the same type of error when you have multiple keys, so its consistent. (and ok)