From ae17462d54e54f76e756831ead918edd525bda9c Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 11 Dec 2012 16:24:20 -0500 Subject: [PATCH 01/13] ENH: ndim tables in HDFStore changes axes_to_index keyword in table creation to axes allow passing of numeric or named axes (e.g. 0 or 'minor_axis') in axes create_axes now checks for current table scheme; raises if this indexing scheme is violated added many p4d tests for appending/selection/partial selection/and axis permuation added addition Term tests to include p4d add __eq__ operators to IndexCol/DataCol/Table to comparisons updated docs with Panel4D saving & issues relating to threading supporting non-regular indexables: e.g. can index a Panel4D on say [labels,major_axis,minor_axis], rather than the default of [items,major_axis,minor_axis] support column oriented DataFrames (e.g. queryable by the columns) --- doc/source/io.rst | 25 ++- pandas/io/pytables.py | 257 +++++++++++++++++++++++-------- pandas/io/tests/test_pytables.py | 189 +++++++++++++++++------ 3 files changed, 356 insertions(+), 115 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index bbf473628cafb..336634308c5bc 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1140,15 +1140,17 @@ Delete from a Table .. ipython:: python + # returns the number of rows deleted store.remove('wp', 'major_axis>20000102' ) store.select('wp') Notes & Caveats ~~~~~~~~~~~~~~~ - - Selection by items (the top level panel dimension) is not possible; you always get all of the items in the returned Panel - Once a ``table`` is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended - You can not append/select/delete to a non-table (table creation is determined on the first append, or by passing ``table=True`` in a put operation) + - ``HDFStore`` is **not-threadsafe for writing**. The underlying ``PyTables`` only supports concurrent reads (via threading or processes). 
If you need reading and writing *at the same time*, you need to serialize these operations in a single thread in a single process. You will corrupt your data otherwise. See the issue for more information. + - ``PyTables`` only supports fixed-width string columns in ``tables``. The sizes of a string based indexing column (e.g. *column* or *minor_axis*) are determined as the maximum size of the elements in that axis or by passing the parameter ``min_itemsize`` on the first table creation (``min_itemsize`` can be an integer or a dict of column name to an integer). If subsequent appends introduce elements in the indexing axis that are larger than the supported indexer, an Exception will be raised (otherwise you could have a silent truncation of these indexers, leading to loss of information). This is **ONLY** necessary for storing ``Panels`` (as the indexing column is stored directly in a column) .. ipython:: python @@ -1183,6 +1185,27 @@ Performance use the pytables utilities ``ptrepack`` to rewrite the file (and also can change compression methods) - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) +Experimental +~~~~~~~~~~~~ + +HDFStore supports ``Panel4D`` storage. + +.. ipython:: python + + p4d = Panel4D({ 'l1' : wp }) + p4d + store.append('p4d', p4d) + store + +These, by default, index the three axes ``items, major_axis, minor_axis``. On an ``AppendableTable`` it is possible to setup with the first append a different indexing scheme, depending on how you want to store your data. Pass the ``axes`` keyword with a list of dimension (currently must by exactly 1 less than the total dimensions of the object). This cannot be changed after table creation. + +.. 
ipython:: python + + from pandas.io.pytables import Term + store.append('p4d2', p4d, axes = ['labels','major_axis','minor_axis']) + store + store.select('p4d2', [ Term('labels=l1'), Term('items=Item1'), Term('minor_axis=A_big_strings') ]) + .. ipython:: python :suppress: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 86627563854b3..e5d0dd76de5a7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -390,7 +390,7 @@ def remove(self, key, where=None): if group is not None: # remove the node - if where is None or not len(where): + if where is None: group = self.get_node(key) group._f_remove(recursive=True) @@ -617,10 +617,6 @@ def _read_block_manager(self, group): return BlockManager(blocks, axes) - def _write_frame_table(self, group, df, append=False, comp=None, **kwargs): - t = create_table(self, group, typ = 'appendable_frame') - t.write(axes_to_index=[0], obj=df, append=append, compression=comp, **kwargs) - def _write_wide(self, group, panel): panel._consolidate_inplace() self._write_block_manager(group, panel._data) @@ -628,23 +624,33 @@ def _write_wide(self, group, panel): def _read_wide(self, group, where=None): return Panel(self._read_block_manager(group)) - def _write_wide_table(self, group, panel, append=False, comp=None, **kwargs): - t = create_table(self, group, typ = 'appendable_panel') - t.write(axes_to_index=[1,2], obj=panel, - append=append, compression=comp, **kwargs) - - def _write_ndim_table(self, group, obj, append=False, comp=None, axes_to_index=None, **kwargs): - if axes_to_index is None: - axes_to_index=[1,2,3] + def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, **kwargs): + if axes is None: + axes = [1,2,3] t = create_table(self, group, typ = 'appendable_ndim') - t.write(axes_to_index=axes_to_index, obj=obj, + t.write(axes=axes, obj=obj, append=append, compression=comp, **kwargs) - def _read_wide_table(self, group, where=None): + def _read_ndim_table(self, group, where=None): t = 
create_table(self, group) return t.read(where) - _read_ndim_table = _read_wide_table + def _write_frame_table(self, group, df, append=False, comp=None, axes=None, **kwargs): + if axes is None: + axes = [0] + t = create_table(self, group, typ = 'appendable_frame') + t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) + + _read_frame_table = _read_ndim_table + + def _write_wide_table(self, group, panel, append=False, comp=None, axes=None, **kwargs): + if axes is None: + axes = [1,2] + t = create_table(self, group, typ = 'appendable_panel') + t.write(axes=axes, obj=panel, + append=append, compression=comp, **kwargs) + + _read_wide_table = _read_ndim_table def _write_index(self, group, key, index): if isinstance(index, MultiIndex): @@ -860,11 +866,6 @@ def _read_index_legacy(self, group, key): kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind) - def _read_frame_table(self, group, where=None): - t = create_table(self, group) - return t.read(where) - - class IndexCol(object): """ an index column description class @@ -928,6 +929,10 @@ def __repr__(self): __str__ = __repr__ + def __eq__(self, other): + """ compare 2 col items """ + return all([ getattr(self,a,None) == getattr(other,a,None) for a in ['name','cname','axis','pos'] ]) + def copy(self): new_self = copy.copy(self) return new_self @@ -1034,6 +1039,10 @@ def __init__(self, values = None, kind = None, typ = None, cname = None, data = def __repr__(self): return "name->%s,cname->%s,dtype->%s,shape->%s" % (self.name,self.cname,self.dtype,self.shape) + def __eq__(self, other): + """ compare 2 col items """ + return all([ getattr(self,a,None) == getattr(other,a,None) for a in ['name','cname','dtype','pos'] ]) + def set_data(self, data): self.data = data if data is not None: @@ -1129,10 +1138,25 @@ def pandas_type(self): def __repr__(self): """ return a pretty representatgion of myself """ - return "%s (typ->%s,nrows->%s)" % (self.pandas_type,self.table_type_short,self.nrows) + 
self.infer_axes() + return "%s (typ->%s,nrows->%s,indexers->[%s])" % (self.pandas_type,self.table_type_short,self.nrows,','.join([ a.name for a in self.index_axes ])) __str__ = __repr__ + def copy(self): + new_self = copy.copy(self) + return new_self + + def __eq__(self, other): + """ return True if we are 'equal' to this other table (in all respects that matter) """ + for c in ['index_axes','non_index_axes','values_axes']: + if getattr(self,c,None) != getattr(other,c,None): + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + @property def nrows(self): return getattr(self.table,'nrows',None) @@ -1178,6 +1202,15 @@ def description(self): def axes(self): return itertools.chain(self.index_axes, self.values_axes) + @property + def is_transposed(self): + return False + + @property + def data_orientation(self): + """ return a tuple of my permutated axes, non_indexable at the front """ + return tuple(itertools.chain([ a[0] for a in self.non_index_axes ], [ a.axis for a in self.index_axes ])) + def queryables(self): """ return a dict of the kinds allowable columns for this object """ return dict([ (a.cname,a.kind) for a in self.index_axes ] + [ (self.obj_type._AXIS_NAMES[axis],None) for axis, values in self.non_index_axes ]) @@ -1243,10 +1276,7 @@ def create_index(self, columns = None, optlevel = None, kind = None): """ - table = self.table - if table is None: return - - self.infer_axes() + if not self.infer_axes(): return if columns is None: columns = [ self.index_axes[0].name ] @@ -1259,16 +1289,17 @@ def create_index(self, columns = None, optlevel = None, kind = None): if kind is not None: kw['kind'] = kind + table = self.table for c in columns: v = getattr(table.cols,c,None) if v is not None and not v.is_indexed: v.createIndex(**kw) def read_axes(self, where): - """ create and return the axes sniffed from the table """ + """ create and return the axes sniffed from the table: return boolean for success """ # infer the data kind - 
self.infer_axes() + if not self.infer_axes(): return False # create the selection self.selection = Selection(self, where) @@ -1278,25 +1309,54 @@ def read_axes(self, where): for a in self.axes: a.convert(self.selection) + return True + def infer_axes(self): - """ infer the axes from the indexables """ + """ infer the axes from the indexables: + return a boolean indicating if we have a valid table or not """ + + table = self.table + if table is None: + return False + self.index_axes, self.values_axes = [ a.infer(self.table) for a in self.indexables if a.is_indexable ], [ a.infer(self.table) for a in self.indexables if not a.is_indexable ] - self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] + self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] + + return True - def create_axes(self, axes_to_index, obj, validate = True, min_itemsize = None): + def get_data_blocks(self, obj): + """ return the data blocks for this obj """ + return obj._data.blocks + + def create_axes(self, axes, obj, validate = True, min_itemsize = None): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields """ - self.index_axes = [] - self.non_index_axes = [] + # map axes to numbers + axes = set([ obj._get_axis_number(a) for a in axes ]) + + # do we have an existing table (if so, use its axes)? 
+ if self.infer_axes(): + existing_table = self.copy() + axes = [ a.axis for a in existing_table.index_axes] + else: + existing_table = None + + # currently support on ndim-1 axes + if len(axes) != self.ndim-1: + raise Exception("currenctly only support ndim-1 indexers in an AppendableTable") + + # create according to the new data + self.index_axes = [] + self.non_index_axes = [] # create axes to index and non_index j = 0 for i, a in enumerate(obj.axes): - if i in axes_to_index: + if i in axes: name = obj._AXIS_NAMES[i] self.index_axes.append(_convert_index(a).set_name(name).set_axis(i).set_pos(j)) j += 1 @@ -1309,17 +1369,23 @@ def create_axes(self, axes_to_index, obj, validate = True, min_itemsize = None): for a in self.axes: a.maybe_set_size(min_itemsize = min_itemsize) + + blocks = self.get_data_blocks(obj) + # add my values self.values_axes = [] - for i, b in enumerate(obj._data.blocks): + for i, b in enumerate(blocks): + + # shape of the data column are the indexable axes + shape = b.shape[0] values = b.values # a string column if b.dtype.name == 'object': - atom = _tables().StringCol(itemsize = values.dtype.itemsize, shape = b.shape[0]) + atom = _tables().StringCol(itemsize = values.dtype.itemsize, shape = shape) utype = 'S8' else: - atom = getattr(_tables(),"%sCol" % b.dtype.name.capitalize())(shape = b.shape[0]) + atom = getattr(_tables(),"%sCol" % b.dtype.name.capitalize())(shape = shape) utype = atom._deftype # coerce data to this type @@ -1332,6 +1398,11 @@ def create_axes(self, axes_to_index, obj, validate = True, min_itemsize = None): j += 1 self.values_axes.append(dc) + # validate the axes if we have an existing table + if existing_table is not None: + if self != existing_table: + raise Exception("try to write axes [%s] that are invalid to an existing table [%s]!" 
% (axes,self.group)) + def create_description(self, compression = None, complevel = None): """ create the description of the table from the axes & values """ @@ -1392,10 +1463,11 @@ class LegacyTable(Table): that can be easily searched """ - _indexables = [IndexCol(name = 'index', axis = 0, pos = 0), - IndexCol(name = 'column', axis = 1, pos = 1, index_kind = 'columns_kind'), + _indexables = [IndexCol(name = 'index', axis = 1, pos = 0), + IndexCol(name = 'column', axis = 2, pos = 1, index_kind = 'columns_kind'), DataCol( name = 'fields', cname = 'values', kind_attr = 'fields', pos = 2) ] table_type = 'legacy' + ndim = 3 def write(self, **kwargs): raise Exception("write operations are not allowed on legacy tables!") @@ -1403,7 +1475,7 @@ def write(self, **kwargs): def read(self, where=None): """ we have n indexable columns, with an arbitrary number of data axes """ - self.read_axes(where) + if not self.read_axes(where): return None indicies = [ i.values for i in self.index_axes ] factors = [ Factor.from_array(i) for i in indicies ] @@ -1428,10 +1500,17 @@ def read(self, where=None): take_labels = [ l.take(sorter) for l in labels ] items = Index(c.values) + block = block2d_to_blocknd(sorted_values, items, tuple(N), take_labels) - block = block2d_to_blocknd(sorted_values, items, tuple(N), take_labels) + # create the object mgr = BlockManager([block], [items] + levels) - objs.append(self.obj_type(mgr)) + obj = self.obj_type(mgr) + + # permute if needed + if self.is_transposed: + obj = obj.transpose(*self.data_orientation) + + objs.append(obj) else: if not self._quiet: # pragma: no cover @@ -1466,9 +1545,12 @@ def read(self, where=None): for axis,labels in self.non_index_axes: wp = wp.reindex_axis(labels,axis=axis,copy=False) + # apply the selection filters (but keep in the same order) if self.selection.filter: - new_minor = sorted(set(wp.minor_axis) & self.selection.filter) - wp = wp.reindex(minor=new_minor, copy = False) + filter_axis_name = 
wp._get_axis_name(self.non_index_axes[0][0]) + ordered = getattr(wp,filter_axis_name) + new_axis = sorted(ordered & self.selection.filter) + wp = wp.reindex(**{ filter_axis_name : new_axis, 'copy' : False }) return wp @@ -1489,7 +1571,7 @@ class AppendableTable(LegacyTable): _indexables = None table_type = 'appendable' - def write(self, axes_to_index, obj, append=False, compression=None, + def write(self, axes, obj, append=False, compression=None, complevel=None, min_itemsize = None, **kwargs): # create the table if it doesn't exist (or get it if it does) @@ -1498,7 +1580,7 @@ def write(self, axes_to_index, obj, append=False, compression=None, self.handle.removeNode(self.group, 'table') # create the axes - self.create_axes(axes_to_index = axes_to_index, obj = obj, validate = append, min_itemsize = min_itemsize) + self.create_axes(axes = axes, obj = obj, validate = append, min_itemsize = min_itemsize) if 'table' not in self.group: @@ -1530,9 +1612,9 @@ def write(self, axes_to_index, obj, append=False, compression=None, def write_data(self): """ fast writing of data: requires specific cython routines each axis shape """ + # create the masks & values + #import pdb; pdb.set_trace() masks = [] - - # create the masks for a in self.values_axes: # figure the mask: only do if we can successfully process this column, otherwise ignore the mask @@ -1549,7 +1631,7 @@ def write_data(self): for m in masks[1:]: m = mask & m - # the arguments & values + # the arguments args = [ a.cvalues for a in self.index_axes ] values = [ a.data for a in self.values_axes ] @@ -1562,17 +1644,22 @@ def write_data(self): if len(rows): self.table.append(rows) except (Exception), detail: + #import pdb; pdb.set_trace() raise Exception("tables cannot write this data -> %s" % str(detail)) def delete(self, where = None): - if where is None: - return super(LegacyTable, self).delete() + + # delete all rows (and return the nrows) + if where is None or not len(where): + nrows = self.nrows + 
self.handle.removeNode(self.group, recursive=True) + return nrows # infer the data kind - table = self.table - self.infer_axes() + if not self.infer_axes(): return None # create the selection + table = self.table self.selection = Selection(self, where) self.selection.select_coords() @@ -1603,32 +1690,53 @@ class AppendableFrameTable(AppendableTable): ndim = 2 obj_type = DataFrame + @property + def is_transposed(self): + return self.index_axes[0].axis == 1 + + def get_data_blocks(self, obj): + """ these are written transposed """ + if self.is_transposed: + obj = obj.T + return obj._data.blocks + def read(self, where=None): - self.read_axes(where) + if not self.read_axes(where): return None index = Index(self.index_axes[0].values) frames = [] for a in self.values_axes: columns = Index(a.values) - block = make_block(a.cvalues.T, columns, columns) - mgr = BlockManager([ block ], [ columns, index ]) + + if self.is_transposed: + values = a.cvalues + index_ = columns + columns_ = index + else: + values = a.cvalues.T + index_ = index + columns_ = columns + + block = make_block(values, columns_, columns_) + mgr = BlockManager([ block ], [ columns_, index_ ]) frames.append(DataFrame(mgr)) df = concat(frames, axis = 1, verify_integrity = True) # sort the indicies & reorder the columns for axis,labels in self.non_index_axes: df = df.reindex_axis(labels,axis=axis,copy=False) - columns_ordered = df.columns - # apply the column filter (but keep columns in the same order) - if self.selection.filter: - columns = Index(set(columns_ordered) & self.selection.filter) - columns = sorted(columns_ordered.get_indexer(columns)) - df = df.reindex(columns = columns_ordered.take(columns), copy = False) + # apply the selection filters (but keep in the same order) + filter_axis_name = df._get_axis_name(self.non_index_axes[0][0]) + ordered = getattr(df,filter_axis_name) + if self.selection.filter: + ordd = ordered & self.selection.filter + ordd = sorted(ordered.get_indexer(ordd)) + df = 
df.reindex(**{ filter_axis_name : ordered.take(ordd), 'copy' : False }) else: - df = df.reindex(columns = columns_ordered, copy = False) + df = df.reindex(**{ filter_axis_name : ordered , 'copy' : False }) return df @@ -1638,6 +1746,16 @@ class AppendablePanelTable(AppendableTable): ndim = 3 obj_type = Panel + def get_data_blocks(self, obj): + """ these are written transposed """ + if self.is_transposed: + obj = obj.transpose(*self.data_orientation) + return obj._data.blocks + + @property + def is_transposed(self): + return self.data_orientation != tuple(range(self.ndim)) + class AppendableNDimTable(AppendablePanelTable): """ suppor the new appendable table formats """ table_type = 'appendable_ndim' @@ -1957,8 +2075,11 @@ def eval(self): if not self.is_valid: raise Exception("query term is not valid [%s]" % str(self)) - # convert values - values = [ self.convert_value(v) for v in self.value ] + # convert values if we are in the table + if self.is_in_table: + values = [ self.convert_value(v) for v in self.value ] + else: + values = [ [v, v] for v in self.value ] # equality conditions if self.op in ['=','!=']: @@ -1983,6 +2104,10 @@ def eval(self): self.condition = '(%s %s %s)' % (self.field, self.op, values[0][0]) + else: + + raise Exception("passing a filterable condition to a non-table indexer [%s]" % str(self)) + def convert_value(self, v): #### a little hacky here, need to really figure out what we should convert ####x @@ -2052,7 +2177,7 @@ def select_coords(self): """ generate the selection """ - self.values = self.table.table.getWhereList(self.condition) + self.values = self.table.table.getWhereList(self.condition, sort = True) def _get_index_factory(klass): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 7ecb0bc2fd5ee..455182fc21e5e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -158,50 +158,108 @@ def test_put_integer(self): self._check_roundtrip(df, tm.assert_frame_equal) def 
test_append(self): - pth = '__test_append__.h5' - try: - store = HDFStore(pth) - - df = tm.makeTimeDataFrame() - store.append('df1', df[:10]) - store.append('df1', df[10:]) - tm.assert_frame_equal(store['df1'], df) - - store.put('df2', df[:10], table=True) - store.append('df2', df[10:]) - tm.assert_frame_equal(store['df2'], df) - - store.append('/df3', df[:10]) - store.append('/df3', df[10:]) - tm.assert_frame_equal(store['df3'], df) - - # this is allowed by almost always don't want to do it - import warnings - import tables - warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) - store.append('/df3 foo', df[:10]) - store.append('/df3 foo', df[10:]) - tm.assert_frame_equal(store['df3 foo'], df) - warnings.filterwarnings('always', category=tables.NaturalNameWarning) - - # panel - wp = tm.makePanel() - store.append('wp1', wp.ix[:,:10,:]) - store.append('wp1', wp.ix[:,10:,:]) - tm.assert_panel_equal(store['wp1'], wp) - - # ndim - p4d = tm.makePanel4D() - store.append('p4d', p4d.ix[:,:,:10,:]) - store.append('p4d', p4d.ix[:,:,10:,:]) - tm.assert_panel4d_equal(store['p4d'], p4d) - - except: - raise - finally: - store.close() - os.remove(pth) + df = tm.makeTimeDataFrame() + self.store.remove('df1') + self.store.append('df1', df[:10]) + self.store.append('df1', df[10:]) + tm.assert_frame_equal(self.store['df1'], df) + + self.store.remove('df2') + self.store.put('df2', df[:10], table=True) + self.store.append('df2', df[10:]) + tm.assert_frame_equal(self.store['df2'], df) + + self.store.remove('df3') + self.store.append('/df3', df[:10]) + self.store.append('/df3', df[10:]) + tm.assert_frame_equal(self.store['df3'], df) + + # this is allowed by almost always don't want to do it + import warnings + import tables + warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + self.store.remove('/df3 foo') + self.store.append('/df3 foo', df[:10]) + self.store.append('/df3 foo', df[10:]) + tm.assert_frame_equal(self.store['df3 foo'], df) + 
warnings.filterwarnings('always', category=tables.NaturalNameWarning) + + # panel + wp = tm.makePanel() + self.store.remove('wp1') + self.store.append('wp1', wp.ix[:,:10,:]) + self.store.append('wp1', wp.ix[:,10:,:]) + tm.assert_panel_equal(self.store['wp1'], wp) + + # ndim + p4d = tm.makePanel4D() + self.store.remove('p4d') + self.store.append('p4d', p4d.ix[:,:,:10,:]) + self.store.append('p4d', p4d.ix[:,:,10:,:]) + tm.assert_panel4d_equal(self.store['p4d'], p4d) + + # test using axis labels + self.store.remove('p4d') + self.store.append('p4d', p4d.ix[:,:,:10,:], axes=['items','major_axis','minor_axis']) + self.store.append('p4d', p4d.ix[:,:,10:,:], axes=['items','major_axis','minor_axis']) + tm.assert_panel4d_equal(self.store['p4d'], p4d) + + def test_append_frame_column_oriented(self): + + # column oriented + df = tm.makeTimeDataFrame() + self.store.remove('df1') + self.store.append('df1', df.ix[:,:2], axes = ['columns']) + self.store.append('df1', df.ix[:,2:]) + tm.assert_frame_equal(self.store['df1'], df) + + result = self.store.select('df1', 'columns=A') + expected = df.reindex(columns=['A']) + tm.assert_frame_equal(expected, result) + + # this isn't supported + self.assertRaises(Exception, self.store.select, 'df1', ('columns=A', Term('index','>',df.index[4]))) + + # selection on the non-indexable + result = self.store.select('df1', ('columns=A', Term('index','=',df.index[0:4]))) + expected = df.reindex(columns=['A'],index=df.index[0:4]) + tm.assert_frame_equal(expected, result) + + def test_ndim_indexables(self): + """ test using ndim tables in new ways""" + + p4d = tm.makePanel4D() + + # append then change (will take existing schema) + self.store.remove('p4d') + self.store.append('p4d', p4d.ix[:,:,:10,:], axes=['items','major_axis','minor_axis']) + self.store.append('p4d', p4d.ix[:,:,10:,:], axes=['labels','items','major_axis']) + + # pass incorrect number of axes + self.store.remove('p4d') + self.assertRaises(Exception, self.store.append, 'p4d', 
p4d.ix[:,:,:10,:], axes=['major_axis','minor_axis']) + + # different than default indexables + self.store.remove('p4d') + self.store.append('p4d', p4d.ix[:,:,:10,:], axes=[0,2,3]) + self.store.append('p4d', p4d.ix[:,:,10:,:], axes=[0,2,3]) + tm.assert_panel4d_equal(self.store['p4d'], p4d) + + # partial selection + result = self.store.select('p4d',['labels=l1']) + expected = p4d.reindex(labels = ['l1']) + tm.assert_panel4d_equal(result, expected) + + # partial selection2 + result = self.store.select('p4d',[Term('labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) + expected = p4d.reindex(labels = ['l1'], items = ['ItemA'], minor_axis = ['B']) + tm.assert_panel4d_equal(result, expected) + + # non-existant partial selection + result = self.store.select('p4d',[Term('labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) + expected = p4d.reindex(labels = ['l1'], items = [], minor_axis = ['B']) + tm.assert_panel4d_equal(result, expected) def test_append_with_strings(self): wp = tm.makePanel() @@ -317,6 +375,21 @@ def _make_one_panel(): self.store.append('p1_mixed', p1) tm.assert_panel_equal(self.store.select('p1_mixed'), p1) + # ndim + def _make_one_p4d(): + wp = tm.makePanel4D() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['l1'] > 0 + wp['bool2'] = wp['l2'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + return wp.consolidate() + + p4d = _make_one_p4d() + self.store.append('p4d_mixed', p4d) + tm.assert_panel4d_equal(self.store.select('p4d_mixed'), p4d) + def test_remove(self): ts = tm.makeTimeSeries() df = tm.makeDataFrame() @@ -366,7 +439,10 @@ def test_remove_where(self): # empty where self.store.remove('wp') self.store.put('wp', wp, table=True) - self.store.remove('wp', []) + + # deleted number (entire table) + n = self.store.remove('wp', []) + assert(n == 120) # non - empty where self.store.remove('wp') @@ -387,8 +463,14 @@ def test_remove_crit(self): crit1 = Term('major_axis','>',date) crit2 = Term('minor_axis',['A', 'D']) - self.store.remove('wp', 
where=[crit1]) - self.store.remove('wp', where=[crit2]) + n = self.store.remove('wp', where=[crit1]) + + # deleted number + assert(n == 56) + + n = self.store.remove('wp', where=[crit2]) + assert(n == 32) + result = self.store['wp'] expected = wp.truncate(after=date).reindex(minor=['B', 'C']) tm.assert_panel_equal(result, expected) @@ -447,8 +529,8 @@ def test_terms(self): tm.assert_panel_equal(result, expected) # p4d - result = self.store.select('p4d',[ Term('major_axis<20000108'), Term('minor_axis', '=', ['A','B']) ]) - expected = p4d.truncate(after='20000108').reindex(minor=['A', 'B']) + result = self.store.select('p4d',[ Term('major_axis<20000108'), Term('minor_axis', '=', ['A','B']), Term('items', '=', ['ItemA','ItemB']) ]) + expected = p4d.truncate(after='20000108').reindex(minor=['A', 'B'],items=['ItemA','ItemB']) tm.assert_panel4d_equal(result, expected) # valid terms @@ -464,12 +546,23 @@ def test_terms(self): (('minor_axis', ['A','B']),), (('minor_axis', ['A','B']),), ((('minor_axis', ['A','B']),),), + (('items', ['ItemA','ItemB']),), + ('items=ItemA'), ] for t in terms: self.store.select('wp', t) self.store.select('p4d', t) + # valid for p4d only + terms = [ + (('labels', '=', ['l1','l2']),), + Term('labels', '=', ['l1','l2']), + ] + + for t in terms: + self.store.select('p4d', t) + def test_series(self): s = tm.makeStringSeries() self._check_roundtrip(s, tm.assert_series_equal) From 03011c11638ac550cf000635a81dbf099648e0cc Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 09:47:34 -0500 Subject: [PATCH 02/13] BUG: removed table check in select if where is provided (convience really) allow types in Term that are datetime-like (e.g. 
can provide a timetuple method) added a warning if you try to select/remove with a where criteria on a legacy table (which isn't supported), you must convert to new format added versioning ability, 'pandas_version', can't detect future format changes (not a required attribute) --- pandas/io/pytables.py | 17 +++++++++++--- pandas/io/tests/test_pytables.py | 39 +++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e5d0dd76de5a7..fba11e4f2963a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -10,6 +10,7 @@ import re import copy import itertools +import warnings import numpy as np from pandas import ( @@ -34,6 +35,11 @@ from contextlib import contextmanager +# versioning attribute +_version = '0.10' + +class IncompatibilityWarning(Warning): pass + # reading and writing the full object in one go _TYPE_MAP = { Series: 'series', @@ -341,8 +347,6 @@ def select(self, key, where=None): group = self.get_node(key) if group is None: raise KeyError('No object named %s in the file' % key) - if where is not None and not _is_table_type(group): - raise Exception('can only select with where on objects written as tables') return self._read_group(group, where) def put(self, key, value, table=False, append=False, @@ -498,6 +502,7 @@ def _write_to_group(self, key, value, table=False, append=False, wrapper(value) group._v_attrs.pandas_type = kind + group._v_attrs.pandas_version = _version def _write_series(self, group, series): self._write_index(group, 'index', series.index) @@ -1123,6 +1128,7 @@ class Table(object): def __init__(self, parent, group): self.parent = parent self.group = group + self.version = getattr(group._v_attrs,'version',None) self.index_axes = [] self.non_index_axes = [] self.values_axes = [] @@ -1475,6 +1481,11 @@ def write(self, **kwargs): def read(self, where=None): """ we have n indexable columns, with an arbitrary number of data axes """ + # are we 
trying to operate on an old version? + if where is not None: + if self.version is None or self.version < 0.1: + warnings.warn("where criteria is being ignored as we this version is too old (or not-defined) [%s]" % self.version, IncompatibilityWarning) + if not self.read_axes(where): return None indicies = [ i.values for i in self.index_axes ] @@ -2114,7 +2125,7 @@ def convert_value(self, v): if self.field == 'index' or self.field == 'major_axis': if self.kind == 'datetime64' : return [lib.Timestamp(v).value, None] - elif isinstance(v, datetime): + elif isinstance(v, datetime) or hasattr(v,'timetuple'): return [time.mktime(v.timetuple()), None] elif not isinstance(v, basestring): return [str(v), None] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 455182fc21e5e..1fa454f0a3933 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2,13 +2,14 @@ import unittest import os import sys +import warnings from datetime import datetime import numpy as np from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, date_range, Index) -from pandas.io.pytables import HDFStore, get_store, Term +from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal @@ -85,6 +86,17 @@ def test_contains(self): self.assert_('/foo/b' not in self.store) self.assert_('bar' not in self.store) + def test_versioning(self): + self.store['a'] = tm.makeTimeSeries() + self.store['b'] = tm.makeDataFrame() + df = tm.makeTimeDataFrame() + self.store.remove('df1') + self.store.append('df1', df[:10]) + self.store.append('df1', df[10:]) + self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10') + self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10') + self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10') + def 
test_reopen_handle(self): self.store['a'] = tm.makeTimeSeries() self.store.open('w', warn=False) @@ -176,8 +188,6 @@ def test_append(self): tm.assert_frame_equal(self.store['df3'], df) # this is allowed by almost always don't want to do it - import warnings - import tables warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) self.store.remove('/df3 foo') self.store.append('/df3 foo', df[:10]) @@ -451,9 +461,9 @@ def test_remove_where(self): 'wp', ['foo']) # selectin non-table with a where - self.store.put('wp2', wp, table=False) - self.assertRaises(Exception, self.store.remove, - 'wp2', [('column', ['A', 'D'])]) + #self.store.put('wp2', wp, table=False) + #self.assertRaises(Exception, self.store.remove, + # 'wp2', [('column', ['A', 'D'])]) def test_remove_crit(self): @@ -890,8 +900,8 @@ def test_select(self): self.store.select('wp2') # selectin non-table with a where - self.assertRaises(Exception, self.store.select, - 'wp2', ('column', ['A', 'D'])) + #self.assertRaises(Exception, self.store.select, + # 'wp2', ('column', ['A', 'D'])) def test_panel_select(self): wp = tm.makePanel() @@ -927,9 +937,9 @@ def test_frame_select(self): tm.assert_frame_equal(result, expected) # can't select if not written as table - self.store['frame'] = df - self.assertRaises(Exception, self.store.select, - 'frame', [crit1, crit2]) + #self.store['frame'] = df + #self.assertRaises(Exception, self.store.select, + # 'frame', [crit1, crit2]) def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) @@ -1004,6 +1014,13 @@ def test_legacy_table_read(self): store.select('df1') store.select('df2') store.select('wp1') + + # old version (this still throws an exception though) + import warnings + warnings.filterwarnings('ignore', category=IncompatibilityWarning) + self.assertRaises(Exception, store.select, 'wp1', Term('minor_axis','=','B')) + warnings.filterwarnings('always', category=IncompatibilityWarning) + store.close() def test_legacy_table_write(self): From 
3deb62d84a04136bbf85e6e403962eeabd9903b3 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 11:18:51 -0500 Subject: [PATCH 03/13] ENH: added meta data attribute saving --- pandas/io/pytables.py | 8 +++++++- pandas/io/tests/test_pytables.py | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fba11e4f2963a..35b907d482077 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -503,6 +503,7 @@ def _write_to_group(self, key, value, table=False, append=False, wrapper(value) group._v_attrs.pandas_type = kind group._v_attrs.pandas_version = _version + group._v_attrs.meta = getattr(value,'meta',None) def _write_series(self, group, series): self._write_index(group, 'index', series.index) @@ -842,7 +843,12 @@ def _read_group(self, group, where=None): kind = group._v_attrs.pandas_type kind = _LEGACY_MAP.get(kind, kind) handler = self._get_handler(op='read', kind=kind) - return handler(group, where) + v = handler(group, where) + if v is not None: + meta = getattr(group._v_attrs,'meta',None) + if meta is not None: + v.meta = meta + return v def _read_series(self, group, where=None): index = self._read_index(group, 'index') diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 1fa454f0a3933..4c2567016d7ae 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -97,6 +97,30 @@ def test_versioning(self): self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10') self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10') + def test_meta(self): + meta = { 'foo' : [ 'I love pandas ' ] } + s = tm.makeTimeSeries() + s.meta = meta + self.store['a'] = s + self.assert_(self.store['a'].meta == meta) + + df = tm.makeDataFrame() + df.meta = meta + self.store['b'] = df + self.assert_(self.store['b'].meta == meta) + + # this should work, but because slicing doesn't propgate meta it doesn + 
self.store.remove('df1') + self.store.append('df1', df[:10]) + self.store.append('df1', df[10:]) + results = self.store['df1'] + #self.assert_(getattr(results,'meta',None) == meta) + + # no meta + df = tm.makeDataFrame() + self.store['b'] = df + self.assert_(hasattr(self.store['b'],'meta') == False) + def test_reopen_handle(self): self.store['a'] = tm.makeTimeSeries() self.store.open('w', warn=False) From 4d95fc9aa29dd7a80d4be035a6c5900f328c68db Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 12:50:21 -0500 Subject: [PATCH 04/13] BUG: better detection of legacy_frame tables (and ability to force certain typ's) --- pandas/io/pytables.py | 19 ++++++++++--------- pandas/io/tests/test_pytables.py | 3 +++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 35b907d482077..bd271b05c5668 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -333,7 +333,7 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None): + def select(self, key, where=None, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -347,7 +347,7 @@ def select(self, key, where=None): group = self.get_node(key) if group is None: raise KeyError('No object named %s in the file' % key) - return self._read_group(group, where) + return self._read_group(group, where, **kwargs) def put(self, key, value, table=False, append=False, compression=None, **kwargs): @@ -637,8 +637,8 @@ def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, **kw t.write(axes=axes, obj=obj, append=append, compression=comp, **kwargs) - def _read_ndim_table(self, group, where=None): - t = create_table(self, group) + def _read_ndim_table(self, group, where=None, **kwargs): + t = create_table(self, group, **kwargs) return t.read(where) def _write_frame_table(self, group, df, append=False, comp=None, axes=None, 
**kwargs): @@ -839,11 +839,11 @@ def _write_array(self, group, key, value): getattr(group, key)._v_attrs.transposed = transposed - def _read_group(self, group, where=None): + def _read_group(self, group, where=None, **kwargs): kind = group._v_attrs.pandas_type kind = _LEGACY_MAP.get(kind, kind) handler = self._get_handler(op='read', kind=kind) - v = handler(group, where) + v = handler(group, where, **kwargs) if v is not None: meta = getattr(group._v_attrs,'meta',None) if meta is not None: @@ -1131,7 +1131,7 @@ class Table(object): obj_type = None ndim = None - def __init__(self, parent, group): + def __init__(self, parent, group, **kwargs): self.parent = parent self.group = group self.version = getattr(group._v_attrs,'version',None) @@ -1794,7 +1794,7 @@ def create_table(parent, group, typ = None, **kwargs): """ return a suitable Table class to operate """ pt = getattr(group._v_attrs,'pandas_type',None) - tt = getattr(group._v_attrs,'table_type',None) + tt = getattr(group._v_attrs,'table_type',None) or typ # a new node if pt is None: @@ -1807,7 +1807,8 @@ def create_table(parent, group, typ = None, **kwargs): # distiguish between a frame/table tt = 'legacy_panel' try: - if group.table.description.values.shape[0] == 1: + fields = group.table._v_attrs.fields + if len(fields) == 1 and fields[0] == 'value': tt = 'legacy_frame' except: pass diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 4c2567016d7ae..ad11fb875c8a8 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1039,6 +1039,9 @@ def test_legacy_table_read(self): store.select('df2') store.select('wp1') + # force the frame + store.select('df2', typ = 'legacy_frame') + # old version (this still throws an exception though) import warnings warnings.filterwarnings('ignore', category=IncompatibilityWarning) From 98030301bce1fdd2711a720aa2e26d9485324bdf Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 14:00:58 -0500 Subject: [PATCH 
05/13] BUG: fixed version checking for PyTables >= 2.3 (and put the correct tests in this time) --- pandas/io/pytables.py | 12 ++++++++---- pandas/io/tests/test_pytables.py | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index bd271b05c5668..ad91c75628f37 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -91,9 +91,12 @@ def _tables(): _table_mod = tables # version requirements - major, minor, subv = tables.__version__.split('.') - if int(major) >= 2 and int(minor[0]) >= 3: - _table_supports_index = True + ver = tables.__version__.split('.') + try: + if int(ver[0]) >= 2 and int(ver[1][0]) >= 3: + _table_supports_index = True + except: + pass return _table_mod @@ -437,8 +440,9 @@ def create_table_index(self, key, **kwargs): """ # version requirements + _tables() if not _table_supports_index: - raise("PyTables >= 2.3 is required for table indexing") + raise Exception("PyTables >= 2.3 is required for table indexing") group = self.get_node(key) if group is None: return diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ad11fb875c8a8..88f7174631b80 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -345,6 +345,25 @@ def test_create_table_index(self): pytables._table_supports_index = False self.assertRaises(Exception, self.store.create_table_index, 'f') + # test out some versions + original = tables.__version__ + + for v in ['2.2','2.2b']: + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = v + self.assertRaises(Exception, self.store.create_table_index, 'f') + + for v in ['2.3.1','2.3.1b','2.4dev','2.4',original]: + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = v + self.store.create_table_index('f') + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = original + + def 
test_append_diff_item_order(self): wp = tm.makePanel() wp1 = wp.ix[:, :10, :] From 0cd3db12a184121036def96a76b48a67ab9f3c51 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 15:37:29 -0500 Subject: [PATCH 06/13] BUG: fixes most of the memory issues bug in concat with single object not sure about block2d_to_blocknd memory increase.... --- pandas/io/pytables.py | 39 ++++++++++++++++++++++++++++++-- pandas/io/tests/test_pytables.py | 17 ++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ad91c75628f37..9499cfe2d38f7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1064,6 +1064,11 @@ def set_data(self, data): if self.dtype is None: self.dtype = data.dtype.name + def take_data(self): + """ return the data & release the memory """ + self.data, data = None, self.data + return data + @property def shape(self): return getattr(self.data,'shape',None) @@ -1491,6 +1496,10 @@ def write(self, **kwargs): def read(self, where=None): """ we have n indexable columns, with an arbitrary number of data axes """ + + _dm = create_debug_memory(self.parent) + _dm('start') + # are we trying to operate on an old version? 
if where is not None: if self.version is None or self.version < 0.1: @@ -1498,6 +1507,7 @@ def read(self, where=None): if not self.read_axes(where): return None + _dm('read_axes') indicies = [ i.values for i in self.index_axes ] factors = [ Factor.from_array(i) for i in indicies ] levels = [ f.levels for f in factors ] @@ -1517,11 +1527,13 @@ def read(self, where=None): for c in self.values_axes: # the data need to be sorted - sorted_values = c.data.take(sorter, axis=0) + sorted_values = c.take_data().take(sorter, axis=0) take_labels = [ l.take(sorter) for l in labels ] items = Index(c.values) + _dm('pre block') block = block2d_to_blocknd(sorted_values, items, tuple(N), take_labels) + _dm('block created done') # create the object mgr = BlockManager([block], [items] + levels) @@ -1559,8 +1571,15 @@ def read(self, where=None): lp = DataFrame(new_values, index=new_index, columns=lp.columns) objs.append(lp.to_panel()) + _dm('pre-concat') + # create the composite object - wp = concat(objs, axis = 0, verify_integrity = True) + if len(objs) == 1: + wp = objs[0] + else: + wp = concat(objs, axis = 0, verify_integrity = True) + + _dm('post-concat') # reorder by any non_index_axes for axis,labels in self.non_index_axes: @@ -1573,6 +1592,8 @@ def read(self, where=None): new_axis = sorted(ordered & self.selection.filter) wp = wp.reindex(**{ filter_axis_name : new_axis, 'copy' : False }) + _dm('done') + return wp class LegacyFrameTable(LegacyTable): @@ -2209,3 +2230,17 @@ def f(values, freq=None, tz=None): tz=tz) return f return klass + +def create_debug_memory(parent): + _debug_memory = getattr(parent,'_debug_memory',False) + if not _debug_memory: + def get_memory(s): + pass + else: + import psutil, os + def get_memory(s): + p = psutil.Process(os.getpid()) + (rss,vms) = p.get_memory_info() + mp = p.get_memory_percent() + print "[%s] cur_mem->%.2f (MB),per_mem->%.2f" % (s,rss/1000000.0,mp) + return get_memory diff --git a/pandas/io/tests/test_pytables.py 
b/pandas/io/tests/test_pytables.py index 88f7174631b80..b2d12d2d0fe02 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -364,6 +364,23 @@ def test_create_table_index(self): tables.__version__ = original + def test_big_table(self): + + # create and write a big table + wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%s' % i for i in xrange(20) ], + major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%s' % i for i in xrange(1000) ]) + + wp.ix[:,100:200,300:400] = np.nan + + try: + store = HDFStore(self.scratchpath) + store._debug_memory = True + store.append('wp',wp) + recons = store.select('wp') + finally: + store.close() + os.remove(self.scratchpath) + def test_append_diff_item_order(self): wp = tm.makePanel() wp1 = wp.ix[:, :10, :] From 9ddfb74031aee3ca0a2f8c0b397a9c1f89e8f85b Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 15:58:26 -0500 Subject: [PATCH 07/13] BUG: non-datetime indicies were not being handled correctly in searchings (via Terms) added support for integer, float, date --- pandas/io/pytables.py | 8 +++++++- pandas/io/tests/test_pytables.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9499cfe2d38f7..984ac6d116efd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2157,8 +2157,14 @@ def convert_value(self, v): if self.field == 'index' or self.field == 'major_axis': if self.kind == 'datetime64' : return [lib.Timestamp(v).value, None] - elif isinstance(v, datetime) or hasattr(v,'timetuple'): + elif isinstance(v, datetime) or hasattr(v,'timetuple') or self.kind == 'date': return [time.mktime(v.timetuple()), None] + elif self.kind == 'integer': + v = int(float(v)) + return [v, v] + elif self.kind == 'float': + v = float(v) + return [v, v] elif not isinstance(v, basestring): return [str(v), None] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 
b2d12d2d0fe02..8bb5fdaf5ac4c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -996,6 +996,17 @@ def test_frame_select(self): expected = df.ix[:, ['A']] tm.assert_frame_equal(result, expected) + # other indicies for a frame + + # integer + df = DataFrame(dict(A = np.random.rand(20), B = np.random.rand(20))) + self.store.append('df_int', df) + self.store.select('df_int', [ Term("index<10"), Term("columns", "=", ["A"]) ]) + + df = DataFrame(dict(A = np.random.rand(20), B = np.random.rand(20), index = np.arange(20,dtype='f8'))) + self.store.append('df_float', df) + self.store.select('df_float', [ Term("index<10.0"), Term("columns", "=", ["A"]) ]) + # can't select if not written as table #self.store['frame'] = df #self.assertRaises(Exception, self.store.select, From e1fcb1d1b09410322e358b55ffefa525e32c26f9 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 17:43:46 -0500 Subject: [PATCH 08/13] BUG: fixed string truncation in values by passing min_itemsize = { 'values' : 1024 } --- pandas/io/pytables.py | 12 ++++++++++-- pandas/io/tests/test_pytables.py | 10 ++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 984ac6d116efd..3d39bfc602799 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1403,8 +1403,16 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None): # a string column if b.dtype.name == 'object': - atom = _tables().StringCol(itemsize = values.dtype.itemsize, shape = shape) - utype = 'S8' + + # specified min_itemsize? 
+ if isinstance(min_itemsize, dict): + min_itemsize = int(min_itemsize.get('values')) + + if min_itemsize is None: + min_itemsize = values.dtype.itemsize + + atom = _tables().StringCol(itemsize = min_itemsize, shape = shape) + utype = 'S%s' % min_itemsize else: atom = getattr(_tables(),"%sCol" % b.dtype.name.capitalize())(shape = shape) utype = atom._deftype diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 8bb5fdaf5ac4c..20129894c9eb1 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -320,6 +320,16 @@ def test_append_with_strings(self): self.store.append('s4', wp) self.assertRaises(Exception, self.store.append, 's4', wp2) + # avoid truncation on elements + df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']]) + self.store.append('df_big',df, min_itemsize = { 'values' : 1024 }) + tm.assert_frame_equal(self.store.select('df_big'), df) + + # avoid truncation on elements + df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']]) + self.store.append('df_big2',df, min_itemsize = { 'values' : 300 }) + tm.assert_frame_equal(self.store.select('df_big2'), df) + def test_create_table_index(self): wp = tm.makePanel() self.store.append('p5', wp) From 71a4420db4785d9642c957ca0fc4f55afb7293ef Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Dec 2012 18:59:36 -0500 Subject: [PATCH 09/13] DOC: small doc change w.r.t. min_itemsize --- doc/source/io.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 336634308c5bc..7f4c0d820b05a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1091,6 +1091,7 @@ Storing Mixed Types in a Table Storing mixed-dtype data is supported. Strings are store as a fixed-width using the maximum size of the appended column. Subsequent appends will truncate strings at this length. Passing ``min_itemsize = { column_name : size }`` as a paremeter to append will set a larger minimum for the column. 
Storing ``floats, strings, ints, bools`` are currently supported. +Pass ``min_itemsize`` with a ``column_name`` of values to effect a minimum pre-allocation of space for strings in the dataset. .. ipython:: python @@ -1151,7 +1152,7 @@ Notes & Caveats - You can not append/select/delete to a non-table (table creation is determined on the first append, or by passing ``table=True`` in a put operation) - ``HDFStore`` is **not-threadsafe for writing**. The underlying ``PyTables`` only supports concurrent reads (via threading or processes). If you need reading and writing *at the same time*, you need to serialize these operations in a single thread in a single process. You will corrupt your data otherwise. See the issue for more information. - - ``PyTables`` only supports fixed-width string columns in ``tables``. The sizes of a string based indexing column (e.g. *column* or *minor_axis*) are determined as the maximum size of the elements in that axis or by passing the parameter ``min_itemsize`` on the first table creation (``min_itemsize`` can be an integer or a dict of column name to an integer). If subsequent appends introduce elements in the indexing axis that are larger than the supported indexer, an Exception will be raised (otherwise you could have a silent truncation of these indexers, leading to loss of information). This is **ONLY** necessary for storing ``Panels`` (as the indexing column is stored directly in a column) + - ``PyTables`` only supports fixed-width string columns in ``tables``. The sizes of a string based indexing column (e.g. *column* or *minor_axis*) are determined as the maximum size of the elements in that axis or by passing the parameter ``min_itemsize`` on the first table creation (``min_itemsize`` can be an integer or a dict of column name to an integer). 
If subsequent appends introduce elements in the indexing axis that are larger than the supported indexer, an Exception will be raised (otherwise you could have a silent truncation of these indexers, leading to loss of information). .. ipython:: python From 9f6a2ed272ec7f94aa2bef5cb610ecde68761b73 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Dec 2012 09:45:13 -0500 Subject: [PATCH 10/13] BUG: fixed string appending when length of subsequent is longer/shorter that existing removed meta data saving disable memory tests (and put a try:except: around it) --- pandas/io/pytables.py | 77 +++++++++++++++++++++----------- pandas/io/tests/test_pytables.py | 41 ++++++++++++++++- 2 files changed, 89 insertions(+), 29 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3d39bfc602799..1bcb311dc9917 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -21,7 +21,7 @@ from pandas.tseries.api import PeriodIndex, DatetimeIndex from pandas.core.common import adjoin from pandas.core.algorithms import match, unique - +from pandas.core.strings import str_len from pandas.core.categorical import Factor from pandas.core.common import _asarray_tuplesafe, _try_sort from pandas.core.internals import BlockManager, make_block, form_blocks @@ -507,7 +507,7 @@ def _write_to_group(self, key, value, table=False, append=False, wrapper(value) group._v_attrs.pandas_type = kind group._v_attrs.pandas_version = _version - group._v_attrs.meta = getattr(value,'meta',None) + #group._v_attrs.meta = getattr(value,'meta',None) def _write_series(self, group, series): self._write_index(group, 'index', series.index) @@ -848,10 +848,10 @@ def _read_group(self, group, where=None, **kwargs): kind = _LEGACY_MAP.get(kind, kind) handler = self._get_handler(op='read', kind=kind) v = handler(group, where, **kwargs) - if v is not None: - meta = getattr(group._v_attrs,'meta',None) - if meta is not None: - v.meta = meta + #if v is not None: + # meta = 
getattr(group._v_attrs,'meta',None) + # if meta is not None: + # v.meta = meta return v def _read_series(self, group, where=None): @@ -1001,16 +1001,22 @@ def validate_and_set(self, table, append, **kwargs): self.validate_attr(append) self.set_attr() - def validate_col(self): - """ validate this column & set table data for it """ + def validate_col(self, itemsize = None): + """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - if self.kind == 'string': + dtype = getattr(self,'dtype',None) + if self.kind == 'string' or (dtype is not None and dtype.startswith('string')): c = self.col if c is not None: - if c.itemsize < self.itemsize: - raise Exception("[%s] column has a min_itemsize of [%s] but itemsize [%s] is required!" % (self.cname,self.itemsize,c.itemsize)) + if itemsize is None: + itemsize = self.itemsize + if c.itemsize < itemsize: + raise Exception("[%s] column has a min_itemsize of [%s] but itemsize [%s] is required!" % (self.cname,itemsize,c.itemsize)) + return c.itemsize + + return None def validate_attr(self, append): @@ -1404,18 +1410,27 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None): # a string column if b.dtype.name == 'object': + # itemsize is the maximum length of a string (along any dimension) + itemsize = _itemsize_string_array(values) + # specified min_itemsize? 
if isinstance(min_itemsize, dict): - min_itemsize = int(min_itemsize.get('values')) + itemsize = max(int(min_itemsize.get('values')),itemsize) + + # check for column in the values conflicts + if existing_table is not None and validate: + eci = existing_table.values_axes[i].validate_col(itemsize) + if eci > itemsize: + itemsize = eci - if min_itemsize is None: - min_itemsize = values.dtype.itemsize + atom = _tables().StringCol(itemsize = itemsize, shape = shape) + utype = 'S%s' % itemsize + kind = 'string' - atom = _tables().StringCol(itemsize = min_itemsize, shape = shape) - utype = 'S%s' % min_itemsize else: atom = getattr(_tables(),"%sCol" % b.dtype.name.capitalize())(shape = shape) utype = atom._deftype + kind = b.dtype.name # coerce data to this type try: @@ -1423,7 +1438,7 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None): except (Exception), detail: raise Exception("cannot coerce data type -> [dtype->%s]" % b.dtype.name) - dc = DataCol.create_for_block(i = i, values = list(b.items), kind = b.dtype.name, typ = atom, data = values, pos = j) + dc = DataCol.create_for_block(i = i, values = list(b.items), kind = kind, typ = atom, data = values, pos = j) j += 1 self.values_axes.append(dc) @@ -1663,7 +1678,6 @@ def write_data(self): """ fast writing of data: requires specific cython routines each axis shape """ # create the masks & values - #import pdb; pdb.set_trace() masks = [] for a in self.values_axes: @@ -1694,7 +1708,6 @@ def write_data(self): if len(rows): self.table.append(rows) except (Exception), detail: - #import pdb; pdb.set_trace() raise Exception("tables cannot write this data -> %s" % str(detail)) def delete(self, where = None): @@ -1849,6 +1862,10 @@ def create_table(parent, group, typ = None, **kwargs): return _TABLE_MAP.get(tt)(parent, group, **kwargs) +def _itemsize_string_array(arr): + """ return the maximum size of elements in a strnig array """ + return max([ str_len(arr[v]).max() for v in range(arr.shape[0]) ]) + def 
_convert_index(index): if isinstance(index, DatetimeIndex): converted = index.asi8 @@ -2247,14 +2264,20 @@ def f(values, freq=None, tz=None): def create_debug_memory(parent): _debug_memory = getattr(parent,'_debug_memory',False) + def get_memory(s): + pass + if not _debug_memory: - def get_memory(s): - pass + pass else: - import psutil, os - def get_memory(s): - p = psutil.Process(os.getpid()) - (rss,vms) = p.get_memory_info() - mp = p.get_memory_percent() - print "[%s] cur_mem->%.2f (MB),per_mem->%.2f" % (s,rss/1000000.0,mp) + try: + import psutil, os + def get_memory(s): + p = psutil.Process(os.getpid()) + (rss,vms) = p.get_memory_info() + mp = p.get_memory_percent() + print "[%s] cur_mem->%.2f (MB),per_mem->%.2f" % (s,rss/1000000.0,mp) + except: + pass + return get_memory diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 20129894c9eb1..d9030b15dde71 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -98,6 +98,8 @@ def test_versioning(self): self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10') def test_meta(self): + raise nose.SkipTest('no meta') + meta = { 'foo' : [ 'I love pandas ' ] } s = tm.makeTimeSeries() s.meta = meta @@ -167,6 +169,29 @@ def test_put(self): self.store.put('c', df[:10], table=True, append=False) tm.assert_frame_equal(df[:10], self.store['c']) + def test_put_string_index(self): + + index = Index([ "I am a very long string index: %s" % i for i in range(20) ]) + s = Series(np.arange(20), index = index) + df = DataFrame({ 'A' : s, 'B' : s }) + + self.store['a'] = s + tm.assert_series_equal(self.store['a'], s) + + self.store['b'] = df + tm.assert_frame_equal(self.store['b'], df) + + # mixed length + index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + [ "I am a very long string index: %s" % i for i in range(20) ]) + s = Series(np.arange(21), index = index) + df = DataFrame({ 'A' : s, 'B' : s }) + self.store['a'] = s + 
tm.assert_series_equal(self.store['a'], s) + + self.store['b'] = df + tm.assert_frame_equal(self.store['b'], df) + + def test_put_compression(self): df = tm.makeTimeDataFrame() @@ -325,11 +350,22 @@ def test_append_with_strings(self): self.store.append('df_big',df, min_itemsize = { 'values' : 1024 }) tm.assert_frame_equal(self.store.select('df_big'), df) + # appending smaller string ok + df2 = DataFrame([[124,'asdqy'], [346,'dggnhefbdfb']]) + self.store.append('df_big',df2) + expected = concat([ df, df2 ]) + tm.assert_frame_equal(self.store.select('df_big'), expected) + # avoid truncation on elements df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']]) - self.store.append('df_big2',df, min_itemsize = { 'values' : 300 }) + self.store.append('df_big2',df, min_itemsize = { 'values' : 10 }) tm.assert_frame_equal(self.store.select('df_big2'), df) + # bigger string on next append + self.store.append('df_new',df, min_itemsize = { 'values' : 16 }) + df_new = DataFrame([[124,'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) + self.assertRaises(Exception, self.store.append, 'df_new',df_new) + def test_create_table_index(self): wp = tm.makePanel() self.store.append('p5', wp) @@ -375,7 +411,8 @@ def test_create_table_index(self): def test_big_table(self): - + raise nose.SkipTest('no big table') + # create and write a big table wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%s' % i for i in xrange(20) ], major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%s' % i for i in xrange(1000) ]) From 041951405cac7d4d8450c0bc23e5b4857d620def Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Dec 2012 10:25:42 -0500 Subject: [PATCH 11/13] BUG: fixed versioning of the data, not reporting correct warnings --- pandas/io/pytables.py | 16 ++++++++++------ pandas/io/tests/test_pytables.py | 7 +++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1bcb311dc9917..9361ad4df69b3 100644 --- 
a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1149,7 +1149,7 @@ class Table(object): def __init__(self, parent, group, **kwargs): self.parent = parent self.group = group - self.version = getattr(group._v_attrs,'version',None) + self.version = getattr(group._v_attrs,'pandas_version',None) self.index_axes = [] self.non_index_axes = [] self.values_axes = [] @@ -1257,6 +1257,12 @@ def set_attrs(self): self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes + def validate_version(self, where = None): + """ are we trying to operate on an old version? """ + if where is not None: + if self.version is None or float(self.version) < 0.1: + warnings.warn("where criteria is being ignored as we this version is too old (or not-defined) [%s]" % self.version, IncompatibilityWarning) + def validate(self): """ raise if we have an incompitable table type with the current """ et = getattr(self.attrs,'table_type',None) @@ -1325,6 +1331,9 @@ def create_index(self, columns = None, optlevel = None, kind = None): def read_axes(self, where): """ create and return the axes sniffed from the table: return boolean for success """ + # validate the version + self.validate_version(where) + # infer the data kind if not self.infer_axes(): return False @@ -1523,11 +1532,6 @@ def read(self, where=None): _dm = create_debug_memory(self.parent) _dm('start') - # are we trying to operate on an old version? 
- if where is not None: - if self.version is None or self.version < 0.1: - warnings.warn("where criteria is being ignored as we this version is too old (or not-defined) [%s]" % self.version, IncompatibilityWarning) - if not self.read_axes(where): return None _dm('read_axes') diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d9030b15dde71..9cbc7d33d7fcc 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -97,6 +97,13 @@ def test_versioning(self): self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10') self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10') + # write a file and wipe its versioning + self.store.remove('df2') + self.store.append('df2', df) + self.store.get_node('df2')._v_attrs.pandas_version = None + self.store.select('df2') + self.store.select('df2', [ Term('index','>',df.index[2]) ]) + def test_meta(self): raise nose.SkipTest('no meta') From 93f75b3f791dc2147537bd210ec530d12bf30cf2 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Dec 2012 13:15:49 -0500 Subject: [PATCH 12/13] ENH: allow index recreation by calling create_table_index with new parameters --- pandas/io/pytables.py | 23 +++++++++++++++++++++-- pandas/io/tests/test_pytables.py | 21 ++++++++++++++++++--- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9361ad4df69b3..91bd27ff510ef 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1325,8 +1325,27 @@ def create_index(self, columns = None, optlevel = None, kind = None): table = self.table for c in columns: v = getattr(table.cols,c,None) - if v is not None and not v.is_indexed: - v.createIndex(**kw) + if v is not None: + + # remove the index if the kind/optlevel have changed + if v.is_indexed: + index = v.index + cur_optlevel = index.optlevel + cur_kind = index.kind + + if kind is not None and cur_kind != kind: + v.removeIndex() + else: + kw['kind'] = cur_kind + + 
if optlevel is not None and cur_optlevel != optlevel: + v.removeIndex() + else: + kw['optlevel'] = cur_optlevel + + # create the index + if not v.is_indexed: + v.createIndex(**kw) def read_axes(self, where): """ create and return the axes sniffed from the table: return boolean for success """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9cbc7d33d7fcc..a047109e509a9 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -381,14 +381,29 @@ def test_create_table_index(self): assert(self.store.handle.root.p5.table.cols.major_axis.is_indexed == True) assert(self.store.handle.root.p5.table.cols.minor_axis.is_indexed == False) + # default optlevels + assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) + assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + + # let's change the indexing scheme + self.store.create_table_index('p5') + assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) + assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + self.store.create_table_index('p5', optlevel=9) + assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) + assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + self.store.create_table_index('p5', kind='full') + assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) + assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'full') + self.store.create_table_index('p5', optlevel=1, kind='light') + assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 1) + assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'light') + df = tm.makeTimeDataFrame() self.store.append('f', df[:10]) self.store.append('f', df[10:]) self.store.create_table_index('f') - # create twice - self.store.create_table_index('f') - # try to index a non-table self.store.put('f2', df) 
self.assertRaises(Exception, self.store.create_table_index, 'f2') From 77db9aa79b849745a4a3f1723a32941817dcb8c3 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Dec 2012 13:57:45 -0500 Subject: [PATCH 13/13] DOC: updated HDFStore docs for indexing support and better explanations on how to deal with strings in indexables/values --- doc/source/io.rst | 51 ++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 7f4c0d820b05a..2c5dac3931c0d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1001,7 +1001,7 @@ Objects can be written to the file just like adding key-value pairs to a dict: store['wp'] = wp # the type of stored data - store.handle.root.wp._v_attrs.pandas_type + store.root.wp._v_attrs.pandas_type store @@ -1037,8 +1037,7 @@ Storing in Table format ``HDFStore`` supports another ``PyTables`` format on disk, the ``table`` format. Conceptually a ``table`` is shaped very much like a DataFrame, with rows and columns. A ``table`` may be appended to in the same or other sessions. -In addition, delete & query type operations are supported. You can create an index with ``create_table_index`` -after data is already in the table (this may become automatic in the future or an option on appending/putting a ``table``). +In addition, delete & query type operations are supported. .. ipython:: python :suppress: @@ -1061,11 +1060,7 @@ after data is already in the table (this may become automatic in the future or a store.select('df') # the type of stored data - store.handle.root.df._v_attrs.pandas_type - - # create an index - store.create_table_index('df') - store.handle.root.df.table + store.root.df._v_attrs.pandas_type Hierarchical Keys ~~~~~~~~~~~~~~~~~ @@ -1090,8 +1085,7 @@ Storing Mixed Types in a Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Storing mixed-dtype data is supported. Strings are store as a fixed-width using the maximum size of the appended column. 
Subsequent appends will truncate strings at this length. -Passing ``min_itemsize = { column_name : size }`` as a paremeter to append will set a larger minimum for the column. Storing ``floats, strings, ints, bools`` are currently supported. -Pass ``min_itemsize`` with a ``column_name`` of values to effect a minimum pre-allocation of space for strings in the dataset. +Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools`` are currently supported. .. ipython:: python @@ -1100,11 +1094,14 @@ Pass ``min_itemsize`` with a ``column_name`` of values to effect a minimum pre-a df_mixed['int'] = 1 df_mixed['bool'] = True - store.append('df_mixed',df_mixed) + store.append('df_mixed', df_mixed, min_itemsize = { 'values' : 50 }) df_mixed1 = store.select('df_mixed') df_mixed1 df_mixed1.get_dtype_counts() + # we have provided a minimum string column size + store.root.df_mixed.table + Querying a Table ~~~~~~~~~~~~~~~~ @@ -1136,6 +1133,23 @@ Queries are built up using a list of ``Terms`` (currently only **anding** of ter store store.select('wp',[ 'major_axis>20000102', ('minor_axis', '=', ['A','B']) ]) +Indexing +~~~~~~~~ +You can create an index for a table with ``create_table_index`` after data is already in the table (after an ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. It is not automagically done now because you may want to index different axes than the default (except in the case of a DataFrame, where it almost always makes sense to index the ``index``). + +..
ipython:: python + + # create an index + store.create_table_index('df') + i = store.root.df.table.cols.index.index + i.optlevel, i.kind + + # change an index by passing new parameters + store.create_table_index('df', optlevel = 9, kind = 'full') + i = store.root.df.table.cols.index.index + i.optlevel, i.kind + + Delete from a Table ~~~~~~~~~~~~~~~~~~~ @@ -1152,27 +1166,30 @@ Notes & Caveats - You can not append/select/delete to a non-table (table creation is determined on the first append, or by passing ``table=True`` in a put operation) - ``HDFStore`` is **not-threadsafe for writing**. The underlying ``PyTables`` only supports concurrent reads (via threading or processes). If you need reading and writing *at the same time*, you need to serialize these operations in a single thread in a single process. You will corrupt your data otherwise. See the issue for more information. - - ``PyTables`` only supports fixed-width string columns in ``tables``. The sizes of a string based indexing column (e.g. *column* or *minor_axis*) are determined as the maximum size of the elements in that axis or by passing the parameter ``min_itemsize`` on the first table creation (``min_itemsize`` can be an integer or a dict of column name to an integer). If subsequent appends introduce elements in the indexing axis that are larger than the supported indexer, an Exception will be raised (otherwise you could have a silent truncation of these indexers, leading to loss of information). + - ``PyTables`` only supports fixed-width string columns in ``tables``. The sizes of a string based indexing column (e.g. *columns* or *minor_axis*) are determined as the maximum size of the elements in that axis or by passing the parameter ``min_itemsize`` on the first table creation (``min_itemsize`` can be an integer or a dict of column name to an integer). 
If subsequent appends introduce elements in the indexing axis that are larger than the supported indexer, an Exception will be raised (otherwise you could have a silent truncation of these indexers, leading to loss of information). Just to be clear, this fixed-width restriction applies to **indexables** (the indexing columns) and **string values** in a mixed_type table. .. ipython:: python - store.append('wp_big_strings', wp, min_itemsize = 30) + store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 }) wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2) store.append('wp_big_strings', wp) store.select('wp_big_strings') + # we have provided a minimum minor_axis indexable size + store.root.wp_big_strings.table + Compatibility ~~~~~~~~~~~~~ 0.10 of ``HDFStore`` is backwards compatible for reading tables created in a prior version of pandas, -however, query terms using the prior (undocumented) methodology are unsupported. You must read in the entire -file and write it out using the new format to take advantage of the updates. +however, query terms using the prior (undocumented) methodology are unsupported. ``HDFStore`` will issue a warning if you try to use a prior-version format file. You must read in the entire +file and write it out using the new format to take advantage of the updates. The group attribute ``pandas_version`` contains the version information. Performance ~~~~~~~~~~~ - - ``Tables`` come with a performance penalty as compared to regular stores. The benefit is the ability to append/delete and query (potentially very large amounts of data). + - ``Tables`` come with a writing performance penalty as compared to regular stores. The benefit is the ability to append/delete and query (potentially very large amounts of data). Write times are generally longer as compared with regular stores. Query times can be quite fast, especially on an indexed axis. - ``Tables`` can (as of 0.10.0) be expressed as different types. 
@@ -1180,8 +1197,6 @@ Performance - ``WORMTable`` (pending implementation) - is available to facilitate very fast writing of tables that are also queryable (but CANNOT support appends) - To delete a lot of data, it is sometimes better to erase the table and rewrite it. ``PyTables`` tends to increase the file size with deletions - - In general it is best to store Panels with the most frequently selected dimension in the minor axis and a time/date like dimension in the major axis, but this is not required. Panels can have any major_axis and minor_axis type that is a valid Panel indexer. - - No dimensions are currently indexed automagically (in the ``PyTables`` sense); these require an explict call to ``create_table_index`` - ``Tables`` offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning) use the pytables utilities ``ptrepack`` to rewrite the file (and also can change compression methods) - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs)