Skip to content

BUG: fix HDFStore iterator to handle a where properly (GH8014) #8029

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 16, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@ Bug Fixes
- Bug in pickle deserialization that failed for pre-0.14.1 containers with dup items trying to avoid ambiguity
when matching block and manager items; when there's only one block there's no ambiguity (:issue:`7794`)


- Bug in HDFStore iteration when passing a where (:issue:`8014`)

- Bug where repeated timeseries line and area plots may result in ``ValueError`` or incorrect kind (:issue:`7733`)

Expand Down
103 changes: 62 additions & 41 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,21 +662,18 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
s = self._create_storer(group)
s.infer_axes()

# what we are actually going to do for a chunk
def func(_start, _stop):
return s.read(where=where, start=_start, stop=_stop,
# function to call on iteration
def func(_start, _stop, _where):
return s.read(start=_start, stop=_stop,
where=_where,
columns=columns, **kwargs)

if iterator or chunksize is not None:
if not s.is_table:
raise TypeError(
"can only use an iterator or chunksize on a table")
return TableIterator(self, func, nrows=s.nrows, start=start,
stop=stop, chunksize=chunksize,
auto_close=auto_close)
# create the iterator
it = TableIterator(self, s, func, where=where, nrows=s.nrows, start=start,
stop=stop, iterator=iterator, chunksize=chunksize,
auto_close=auto_close)

return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop,
auto_close=auto_close).get_values()
return it.get_result()

def select_as_coordinates(
self, key, where=None, start=None, stop=None, **kwargs):
Expand Down Expand Up @@ -779,26 +776,22 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
# axis is the concatenation axis
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]

def func(_start, _stop):
if where is not None:
c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs)
else:
c = None
def func(_start, _stop, _where):

objs = [t.read(where=c, start=_start, stop=_stop,
columns=columns, **kwargs) for t in tbls]
# retrieve the objs, _where is always passed as a set of coordinates here
objs = [t.read(where=_where, columns=columns, **kwargs) for t in tbls]

# concat and return
return concat(objs, axis=axis,
verify_integrity=False).consolidate()

if iterator or chunksize is not None:
return TableIterator(self, func, nrows=nrows, start=start,
stop=stop, chunksize=chunksize,
auto_close=auto_close)
# create the iterator
it = TableIterator(self, s, func, where=where, nrows=nrows, start=start,
stop=stop, iterator=iterator, chunksize=chunksize,
auto_close=auto_close)

return it.get_result(coordinates=True)

return TableIterator(self, func, nrows=nrows, start=start, stop=stop,
auto_close=auto_close).get_values()

def put(self, key, value, format=None, append=False, **kwargs):
"""
Expand Down Expand Up @@ -1293,57 +1286,85 @@ class TableIterator(object):
----------

store : the reference store
func : the function to get results
s : the referred storer
func : the function to execute the query
where : the where of the query
nrows : the rows to iterate on
start : the passed start value (default is None)
stop : the passed stop value (default is None)
chunksize : the passed chunking valeu (default is 50000)
iterator : boolean, whether to use the default iterator
chunksize : the passed chunking value (default is 100000)
auto_close : boolean, automatically close the store at the end of
iteration, default is False
kwargs : the passed kwargs
"""

def __init__(self, store, func, nrows, start=None, stop=None,
chunksize=None, auto_close=False):
def __init__(self, store, s, func, where, nrows, start=None, stop=None,
iterator=False, chunksize=None, auto_close=False):
self.store = store
self.func = func
self.s = s
self.func = func
self.where = where
self.nrows = nrows or 0
self.start = start or 0

if stop is None:
stop = self.nrows
self.stop = min(self.nrows, stop)

if chunksize is None:
chunksize = 100000
self.coordinates = None
if iterator or chunksize is not None:
if chunksize is None:
chunksize = 100000
self.chunksize = int(chunksize)
else:
self.chunksize = None

self.chunksize = chunksize
self.auto_close = auto_close

def __iter__(self):

# iterate
current = self.start
while current < self.stop:
stop = current + self.chunksize
v = self.func(current, stop)
current = stop

if v is None:
stop = min(current + self.chunksize, self.stop)
value = self.func(None, None, self.coordinates[current:stop])
current = stop
if value is None or not len(value):
continue

yield v
yield value

self.close()

def close(self):
if self.auto_close:
self.store.close()

def get_values(self):
results = self.func(self.start, self.stop)
def get_result(self, coordinates=False):

# return the actual iterator
if self.chunksize is not None:
if not self.s.is_table:
raise TypeError(
"can only use an iterator or chunksize on a table")

self.coordinates = self.s.read_coordinates(where=self.where)

return self

# if specified, read via coordinates (necessary for multiple selections)
if coordinates:
where = self.s.read_coordinates(where=self.where)
else:
where = self.where

# directly return the result
results = self.func(self.start, self.stop, where)
self.close()
return results


class IndexCol(StringMixin):

""" an index column description class
Expand Down
Loading