From e507c4a790c4f148316d183177117078c261c0da Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 25 Jan 2017 07:20:41 -0500 Subject: [PATCH 1/7] ENH: support MultiIndex and tuple hashing --- pandas/tools/hashing.py | 36 ++++++++++++++++++++++++++++-- pandas/tools/tests/test_hashing.py | 28 ++++++++++++++--------- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 6d2186fdab34c..4acf8f036a360 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -3,11 +3,12 @@ """ import numpy as np -from pandas import _hash, Series, factorize, Categorical, Index +from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype) + is_datetime64_dtype, is_timedelta64_dtype, + is_object_dtype) # 16 byte long hashing key _default_hash_key = '0123456789123456' @@ -45,6 +46,9 @@ def adder(h, hashed_to_add): h = np.multiply(h, np.uint(3), h) return np.add(h, hashed_to_add, h) + if isinstance(obj, MultiIndex): + return _hash_tuples(obj, encoding, hash_key) + if isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, categorize).astype('uint64') @@ -80,6 +84,30 @@ def adder(h, hashed_to_add): return h +def _hash_tuples(vals, encoding, hash_key): + """ + Hash an MultiIndex / array_of_tuples efficiently + + Parameters + ---------- + vals : MultiIndex or ndarray of tuples + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + ndarray of hashed values array, same size as len(c) + """ + + if not isinstance(vals, MultiIndex): + vals = MultiIndex.from_tuples(vals) + + # efficiently turn us into a DataFrame and hash + return hash_pandas_object(vals.to_dataframe(index=False), + index=False, encoding=encoding, + hash_key=hash_key, categorize=False) + + def _hash_categorical(c, encoding, hash_key): """ Hash a Categorical by hashing its categories, and then mapping the codes @@ -127,6 +155,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): if hash_key is None: hash_key = _default_hash_key + if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple): + # we hash an list of tuples similar to a MultiIndex + return _hash_tuples(vals, encoding, hash_key).values + # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 7913706f5658b..211192261db3e 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from pandas import DataFrame, Series, Index +from pandas import DataFrame, Series, Index, MultiIndex from pandas.tools.hashing import hash_array, hash_pandas_object import pandas.util.testing as tm @@ -55,6 +55,18 @@ def check_not_equal_with_index(self, obj): b = hash_pandas_object(obj, index=False) self.assertFalse((a == b).all()) + def test_hash_list_tuples(self): + tups = [(1, 'one'), (1, 'two'), (2, 'one')] + result = hash_array(tups) + expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values + self.assert_numpy_array_equal(result, expected) + + def test_multiindex_unique(self): + mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) + self.assertTrue(mi.is_unique) + result = hash_pandas_object(mi) + self.assertTrue(result.is_unique) + def test_hash_pandas_object(self): for obj in [Series([1, 2, 3]), @@ -72,7 +84,11 @@ def test_hash_pandas_object(self): tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), - tm.makeTimedeltaIndex()]: + tm.makeTimedeltaIndex(), + MultiIndex.from_product( + [range(5), + ['foo', 'bar', 'baz'], + pd.date_range('20130101', periods=2)])]: self.check_equal(obj) self.check_not_equal_with_index(obj) @@ -140,14 +156,6 @@ def f(): hash_pandas_object(obj) self.assertRaises(TypeError, f) - # MultiIndex are represented as tuples - obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples( - [('a', 1), ('a', 2), ('b', 1)])) - - def f(): - hash_pandas_object(obj) - self.assertRaises(TypeError, f) - def test_alread_encoded(self): # if already encoded then ok From 44e9c7dc1d39d7f8553835cb1e54775a24c48848 Mon Sep 17 00:00:00 2001 From: Mike Graham Date: Wed, 25 Jan 2017 12:37:07 -0500 Subject: [PATCH 2/7] wipSteal the algorithm used to combine hashes from tupleobject.c --- pandas/tools/hashing.py | 61 ++++++++++++++++++------------ pandas/tools/tests/test_hashing.py | 3 +- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 4acf8f036a360..b6635b3d8e053 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -1,19 +1,34 @@ """ data hash pandas / numpy objects """ +import itertools import numpy as np from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_object_dtype) + is_datetime64_dtype, is_timedelta64_dtype) # 16 byte long hashing key _default_hash_key = '0123456789123456' +def _combine_hash_arrays(arrays, num_items): + first = next(arrays) + arrays = itertools.chain([first], arrays) + + mult = np.zeros_like(first) + np.uint64(1000003) + out = np.zeros_like(first) + np.uint64(0x345678) + for i, a in enumerate(arrays): + inverse_i = num_items - i + out = (out ^ a) * mult + mult += np.uint64(82520 + inverse_i + inverse_i) + assert i + 1 == num_items, 'Fed in wrong num_items' + out += np.uint64(97531) + return out + + def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, categorize=True): """ @@ -42,10 +57,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, if hash_key is None: hash_key = _default_hash_key - def adder(h, hashed_to_add): - h = np.multiply(h, np.uint(3), h) - return np.add(h, hashed_to_add, h) - if isinstance(obj, MultiIndex): return _hash_tuples(obj, encoding, hash_key) @@ -57,26 +68,28 @@ def adder(h, hashed_to_add): h = hash_array(obj.values, encoding, hash_key, categorize).astype('uint64') if index: - h = adder(h, hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values) + h = _combine_hash_arrays(iter([ + h, + hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize).values]), + 2) h = Series(h, index=obj.index, dtype='uint64') elif isinstance(obj, ABCDataFrame): - cols = obj.iteritems() - first_series = next(cols)[1] - h = hash_array(first_series.values, encoding, - hash_key, categorize).astype('uint64') - for _, col in cols: - h = adder(h, hash_array(col.values, encoding, hash_key, - categorize)) + hashes = (hash_array(series.values) for _, series in obj.iteritems()) + num_items = len(obj.columns) if index: - h = adder(h, hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values) + index_hash_generator = (hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize).values # noqa + for _ in [None]) + num_items += 1 + hashes = itertools.chain(hashes, index_hash_generator) + h = _combine_hash_arrays(hashes, num_items) h = Series(h, index=obj.index, dtype='uint64') else: @@ -103,7 +116,7 @@ def _hash_tuples(vals, encoding, hash_key): vals = MultiIndex.from_tuples(vals) # efficiently turn us into a DataFrame and hash - return hash_pandas_object(vals.to_dataframe(index=False), + return hash_pandas_object(vals.to_frame(index=False), index=False, encoding=encoding, hash_key=hash_key, categorize=False) diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 211192261db3e..4a2cb93899d21 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -62,7 +62,8 @@ def test_hash_list_tuples(self): self.assert_numpy_array_equal(result, expected) def test_multiindex_unique(self): - mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) + mi = MultiIndex.from_tuples([(118, 472), (236, 118), + (51, 204), (102, 51)]) self.assertTrue(mi.is_unique) result = hash_pandas_object(mi) self.assertTrue(result.is_unique) From e8dd6072430e433185b030f946a1cd83f8372511 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 25 Jan 2017 16:22:59 -0500 Subject: [PATCH 3/7] add hash_tuples --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/hashing.py | 88 +++++++++++++++++++++++------- pandas/tools/tests/test_hashing.py | 9 ++- 3 files changed, 74 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2dc15f2fe0781..626ed0b1bac61 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -145,6 +145,7 @@ Other enhancements - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). +- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index b6635b3d8e053..6b220f19b3dff 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -5,6 +5,7 @@ import numpy as np from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex +import pandas.core.algorithms as algos from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, @@ -58,15 +59,16 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, hash_key = _default_hash_key if isinstance(obj, MultiIndex): - return _hash_tuples(obj, encoding, hash_key) + return Series(hash_tuples(obj, encoding, hash_key), + dtype='uint64', copy=False) if isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64') - h = Series(h, index=obj, dtype='uint64') + categorize).astype('uint64', copy=False) + h = Series(h, index=obj, dtype='uint64', copy=False) elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64') + categorize).astype('uint64', copy=False) if index: h = _combine_hash_arrays(iter([ h, @@ -76,7 +78,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, hash_key=hash_key, categorize=categorize).values]), 2) - h = Series(h, index=obj.index, dtype='uint64') + h = Series(h, index=obj.index, dtype='uint64', copy=False) elif isinstance(obj, ABCDataFrame): hashes = (hash_array(series.values) for _, series in obj.iteritems()) num_items = len(obj.columns) @@ -91,34 +93,81 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, hashes = itertools.chain(hashes, index_hash_generator) h = _combine_hash_arrays(hashes, num_items) - h = Series(h, index=obj.index, dtype='uint64') + h = Series(h, index=obj.index, dtype='uint64', copy=False) else: raise TypeError("Unexpected type for hashing %s" % type(obj)) return h -def _hash_tuples(vals, encoding, hash_key): +def _hash_lists(vals, encoding='utf8', hash_key=None): + """ + + Parameters + ---------- + vals : list of ndarrays + encoding : string, default 'utf8' + encoding for data & key when strings + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + 1d uint64 numpy array of hash values, same length as the vals[0] + """ + + if not isinstance(vals, list): + raise TypeError("only can accept lists") + + if not len(vals): + raise ValueError("must pass a non-zero length vals") + + if not isinstance(vals[0], np.ndarray): + raise ValueError("must pass a ndarray") + + hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) + for l in vals) + h = _combine_hash_arrays(hashes, len(vals)) + return h + + +def hash_tuples(vals, encoding='utf8', hash_key=None): """ Hash an MultiIndex / array_of_tuples efficiently Parameters ---------- - vals : MultiIndex or ndarray of tuples + vals : MultiIndex, ndarray of tuples, or single tuple encoding : string, default 'utf8' hash_key : string key to encode, default to _default_hash_key Returns ------- - ndarray of hashed values array, same size as len(c) + ndarray of hashed values array """ + is_tuple = False + if isinstance(vals, tuple): + vals = [vals] + is_tuple = True + if not isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) - # efficiently turn us into a DataFrame and hash - return hash_pandas_object(vals.to_frame(index=False), - index=False, encoding=encoding, - hash_key=hash_key, categorize=False) + # create a list-of-ndarrays & hash + def get_level_values(num): + unique = vals.levels[num] # .values + labels = vals.labels[num] + filled = algos.take_1d(unique.values, labels, + fill_value=unique._na_value) + return filled + + vals = [get_level_values(level) + for level in range(vals.nlevels)] + + result = _hash_lists(vals, encoding=encoding, hash_key=hash_key) + if is_tuple: + result = result[0] + + return result def _hash_categorical(c, encoding, hash_key): @@ -138,7 +187,7 @@ def _hash_categorical(c, encoding, hash_key): """ cat_hashed = hash_array(c.categories.values, encoding, hash_key, categorize=False).astype(np.uint64, copy=False) - return c.rename_categories(cat_hashed).astype(np.uint64) + return c.rename_categories(cat_hashed).astype(np.uint64, copy=False) def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): @@ -168,10 +217,6 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): if hash_key is None: hash_key = _default_hash_key - if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple): - # we hash an list of tuples similar to a MultiIndex - return _hash_tuples(vals, encoding, hash_key).values - # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. @@ -187,9 +232,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # manage it. if is_bool_array(vals): vals = vals.astype('u8') - elif ((is_datetime64_dtype(vals) or - is_timedelta64_dtype(vals) or - is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8): + elif (is_datetime64_dtype(vals) or + is_timedelta64_dtype(vals)): + vals = vals.view('i8').astype('u8', copy=False) + elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 4a2cb93899d21..0eb7f5ef6c8ad 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import DataFrame, Series, Index, MultiIndex -from pandas.tools.hashing import hash_array, hash_pandas_object +from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object import pandas.util.testing as tm @@ -55,12 +55,15 @@ def check_not_equal_with_index(self, obj): b = hash_pandas_object(obj, index=False) self.assertFalse((a == b).all()) - def test_hash_list_tuples(self): + def test_hash_tuples(self): tups = [(1, 'one'), (1, 'two'), (2, 'one')] - result = hash_array(tups) + result = hash_tuples(tups) expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values self.assert_numpy_array_equal(result, expected) + result = hash_tuples(tups[0]) + self.assertEqual(result, expected[0]) + def test_multiindex_unique(self): mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) From 0c13df7ea55d5018848708abfee6ff572855689c Mon Sep 17 00:00:00 2001 From: Mike Graham Date: Wed, 25 Jan 2017 12:37:07 -0500 Subject: [PATCH 4/7] Steal the algorithm used to combine hashes from tupleobject.c --- pandas/tools/hashing.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 6b220f19b3dff..281fbcaf0c22d 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -16,6 +16,7 @@ def _combine_hash_arrays(arrays, num_items): + "Should be the same as CPython's tupleobject.c" first = next(arrays) arrays = itertools.chain([first], arrays) @@ -23,7 +24,8 @@ def _combine_hash_arrays(arrays, num_items): out = np.zeros_like(first) + np.uint64(0x345678) for i, a in enumerate(arrays): inverse_i = num_items - i - out = (out ^ a) * mult + out ^= a + out *= mult mult += np.uint64(82520 + inverse_i + inverse_i) assert i + 1 == num_items, 'Fed in wrong num_items' out += np.uint64(97531) @@ -70,15 +72,17 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, h = hash_array(obj.values, encoding, hash_key, categorize).astype('uint64', copy=False) if index: - h = _combine_hash_arrays(iter([ - h, - hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values]), - 2) + index_iter = (hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize).values + for _ in [None]) + arrays = itertools.chain([h], index_iter) + h = _combine_hash_arrays(arrays, 2) + h = Series(h, index=obj.index, dtype='uint64', copy=False) + elif isinstance(obj, ABCDataFrame): hashes = (hash_array(series.values) for _, series in obj.iteritems()) num_items = len(obj.columns) From 58f682d369e53f74830a3247f8976a8c6cc296e5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 26 Jan 2017 09:12:27 -0500 Subject: [PATCH 5/7] memory optimization --- pandas/tools/hashing.py | 26 ++++++++++++++++++++------ pandas/tools/tests/test_hashing.py | 10 +++++++++- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 281fbcaf0c22d..e3645fab9a164 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -9,18 +9,30 @@ from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype) + is_datetime64_dtype, is_timedelta64_dtype, + is_list_like) # 16 byte long hashing key _default_hash_key = '0123456789123456' def _combine_hash_arrays(arrays, num_items): - "Should be the same as CPython's tupleobject.c" - first = next(arrays) + """ + Parameters + ---------- + arrays : generator + num_items : int + + Should be the same as CPython's tupleobject.c + """ + try: + first = next(arrays) + except StopIteration: + return np.array([], dtype=np.uint64) + arrays = itertools.chain([first], arrays) - mult = np.zeros_like(first) + np.uint64(1000003) + mult = np.uint64(1000003) out = np.zeros_like(first) + np.uint64(0x345678) for i, a in enumerate(arrays): inverse_i = num_items - i @@ -135,11 +147,11 @@ def _hash_lists(vals, encoding='utf8', hash_key=None): def hash_tuples(vals, encoding='utf8', hash_key=None): """ - Hash an MultiIndex / array_of_tuples efficiently + Hash an MultiIndex / list-of-tuples efficiently Parameters ---------- - vals : MultiIndex, ndarray of tuples, or single tuple + vals : MultiIndex, list-of-tuples, or single tuple encoding : string, default 'utf8' hash_key : string key to encode, default to _default_hash_key @@ -152,6 +164,8 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): if isinstance(vals, tuple): vals = [vals] is_tuple = True + elif not is_list_like(vals): + raise TypeError("must be convertible to a list-of-tuples") if not isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 0eb7f5ef6c8ad..0deb5c8b89396 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -53,7 +53,8 @@ def check_not_equal_with_index(self, obj): if not isinstance(obj, Index): a = hash_pandas_object(obj, index=True) b = hash_pandas_object(obj, index=False) - self.assertFalse((a == b).all()) + if len(obj): + self.assertFalse((a == b).all()) def test_hash_tuples(self): tups = [(1, 'one'), (1, 'two'), (2, 'one')] @@ -64,6 +65,11 @@ def test_hash_tuples(self): result = hash_tuples(tups[0]) self.assertEqual(result, expected[0]) + def test_hash_tuples_err(self): + + for val in [5, 'foo', pd.Timestamp('20130101')]: + self.assertRaises(TypeError, hash_tuples, val) + def test_multiindex_unique(self): mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) @@ -81,9 +87,11 @@ def test_hash_pandas_object(self): Series(['a', np.nan, 'c']), Series(['a', None, 'c']), Series([True, False, True]), + Series(), Index([1, 2, 3]), Index([True, False, True]), DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), + DataFrame(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), From 48a2402f02b6430e60a48f3b1ff0bb5248dce638 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 26 Jan 2017 18:26:09 -0500 Subject: [PATCH 6/7] support for mixed type arrays --- pandas/tools/hashing.py | 10 +++++++++- pandas/tools/tests/test_hashing.py | 14 +++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index e3645fab9a164..12228afe5e2c1 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -149,6 +149,8 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently + .. versionadded:: 0.20.0 + Parameters ---------- vals : MultiIndex, list-of-tuples, or single tuple @@ -265,7 +267,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) - vals = _hash.hash_object_array(vals, hash_key, encoding) + try: + vals = _hash.hash_object_array(vals, hash_key, encoding) + except TypeError: + + # we have mixed types + vals = _hash.hash_object_array(vals.astype(str).astype(object), + hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 0deb5c8b89396..9faed167b5a9a 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -36,6 +36,11 @@ def test_hash_array(self): a = s.values tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) + def test_hash_array_mixed(self): + for data in [np.array([3, 4, 'All']), + np.array([3, 4, 'All'], dtype=object)]: + tm.assert_numpy_array_equal(hash_array(data), hash_array(data)) + def check_equal(self, obj, **kwargs): a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) @@ -159,15 +164,6 @@ def f(): hash_pandas_object(Series(list('abc')), hash_key='foo') self.assertRaises(ValueError, f) - def test_unsupported_objects(self): - - # mixed objects are not supported - obj = Series(['1', 2, 3]) - - def f(): - hash_pandas_object(obj) - self.assertRaises(TypeError, f) - def test_alread_encoded(self): # if already encoded then ok From 8b1d3f9c5718799b5b0f2e1d8069d16fab01324a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 26 Jan 2017 18:45:51 -0500 Subject: [PATCH 7/7] not correctly hashing categorical in a MI --- pandas/tools/hashing.py | 43 ++++++------------------------ pandas/tools/tests/test_hashing.py | 9 ++++++- 2 files changed, 16 insertions(+), 36 deletions(-) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 12228afe5e2c1..a62c80c6f8d67 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -115,36 +115,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, return h -def _hash_lists(vals, encoding='utf8', hash_key=None): - """ - - Parameters - ---------- - vals : list of ndarrays - encoding : string, default 'utf8' - encoding for data & key when strings - hash_key : string key to encode, default to _default_hash_key - - Returns - ------- - 1d uint64 numpy array of hash values, same length as the vals[0] - """ - - if not isinstance(vals, list): - raise TypeError("only can accept lists") - - if not len(vals): - raise ValueError("must pass a non-zero length vals") - - if not isinstance(vals[0], np.ndarray): - raise ValueError("must pass a ndarray") - - hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) - for l in vals) - h = _combine_hash_arrays(hashes, len(vals)) - return h - - def hash_tuples(vals, encoding='utf8', hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently @@ -172,22 +142,25 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): if not isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) - # create a list-of-ndarrays & hash + # create a list-of-ndarrays def get_level_values(num): unique = vals.levels[num] # .values labels = vals.labels[num] - filled = algos.take_1d(unique.values, labels, + filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) return filled vals = [get_level_values(level) for level in range(vals.nlevels)] - result = _hash_lists(vals, encoding=encoding, hash_key=hash_key) + # hash the list-of-ndarrays + hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) + for l in vals) + h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: - result = result[0] + h = h[0] - return result + return h def _hash_categorical(c, encoding, hash_key): diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 9faed167b5a9a..ed5a74f8cfcf2 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -102,10 +102,17 @@ def test_hash_pandas_object(self): tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range('20130101', + periods=3, tz='US/Eastern')), MultiIndex.from_product( [range(5), ['foo', 'bar', 'baz'], - pd.date_range('20130101', periods=2)])]: + pd.date_range('20130101', periods=2)]), + MultiIndex.from_product( + [pd.CategoricalIndex(list('aabc')), + range(3)])]: self.check_equal(obj) self.check_not_equal_with_index(obj)