From e507c4a790c4f148316d183177117078c261c0da Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 25 Jan 2017 07:20:41 -0500
Subject: [PATCH 1/7] ENH: support MultiIndex and tuple hashing

---
 pandas/tools/hashing.py            | 36 ++++++++++++++++++++++++++++--
 pandas/tools/tests/test_hashing.py | 28 ++++++++++++++---------
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index 6d2186fdab34c..4acf8f036a360 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -3,11 +3,12 @@
 """
 
 import numpy as np
-from pandas import _hash, Series, factorize, Categorical, Index
+from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
 from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
-                                 is_datetime64_dtype, is_timedelta64_dtype)
+                                 is_datetime64_dtype, is_timedelta64_dtype,
+                                 is_object_dtype)
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
@@ -45,6 +46,9 @@ def adder(h, hashed_to_add):
         h = np.multiply(h, np.uint(3), h)
         return np.add(h, hashed_to_add, h)
 
+    if isinstance(obj, MultiIndex):
+        return _hash_tuples(obj, encoding, hash_key)
+
     if isinstance(obj, ABCIndexClass):
         h = hash_array(obj.values, encoding, hash_key,
                        categorize).astype('uint64')
@@ -80,6 +84,30 @@ def adder(h, hashed_to_add):
     return h
 
 
+def _hash_tuples(vals, encoding, hash_key):
+    """
+    Hash an MultiIndex / array_of_tuples efficiently
+
+    Parameters
+    ----------
+    vals : MultiIndex or ndarray of tuples
+    encoding : string, default 'utf8'
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    ndarray of hashed values array, same size as len(c)
+    """
+
+    if not isinstance(vals, MultiIndex):
+        vals = MultiIndex.from_tuples(vals)
+
+    # efficiently turn us into a DataFrame and hash
+    return hash_pandas_object(vals.to_dataframe(index=False),
+                              index=False, encoding=encoding,
+                              hash_key=hash_key, categorize=False)
+
+
 def _hash_categorical(c, encoding, hash_key):
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
@@ -127,6 +155,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     if hash_key is None:
         hash_key = _default_hash_key
 
+    if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple):
+        # we hash an list of tuples similar to a MultiIndex
+        return _hash_tuples(vals, encoding, hash_key).values
+
     # For categoricals, we hash the categories, then remap the codes to the
     # hash values. (This check is above the complex check so that we don't ask
     # numpy if categorical is a subdtype of complex, as it will choke.
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
index 7913706f5658b..211192261db3e 100644
--- a/pandas/tools/tests/test_hashing.py
+++ b/pandas/tools/tests/test_hashing.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 
-from pandas import DataFrame, Series, Index
+from pandas import DataFrame, Series, Index, MultiIndex
 from pandas.tools.hashing import hash_array, hash_pandas_object
 import pandas.util.testing as tm
 
@@ -55,6 +55,18 @@ def check_not_equal_with_index(self, obj):
             b = hash_pandas_object(obj, index=False)
             self.assertFalse((a == b).all())
 
+    def test_hash_list_tuples(self):
+        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
+        result = hash_array(tups)
+        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_multiindex_unique(self):
+        mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
+        self.assertTrue(mi.is_unique)
+        result = hash_pandas_object(mi)
+        self.assertTrue(result.is_unique)
+
     def test_hash_pandas_object(self):
 
         for obj in [Series([1, 2, 3]),
@@ -72,7 +84,11 @@ def test_hash_pandas_object(self):
                     tm.makeMixedDataFrame(),
                     tm.makeTimeDataFrame(),
                     tm.makeTimeSeries(),
-                    tm.makeTimedeltaIndex()]:
+                    tm.makeTimedeltaIndex(),
+                    MultiIndex.from_product(
+                        [range(5),
+                         ['foo', 'bar', 'baz'],
+                         pd.date_range('20130101', periods=2)])]:
             self.check_equal(obj)
             self.check_not_equal_with_index(obj)
 
@@ -140,14 +156,6 @@ def f():
             hash_pandas_object(obj)
         self.assertRaises(TypeError, f)
 
-        # MultiIndex are represented as tuples
-        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
-            [('a', 1), ('a', 2), ('b', 1)]))
-
-        def f():
-            hash_pandas_object(obj)
-        self.assertRaises(TypeError, f)
-
     def test_alread_encoded(self):
         # if already encoded then ok
 

From 44e9c7dc1d39d7f8553835cb1e54775a24c48848 Mon Sep 17 00:00:00 2001
From: Mike Graham <mikegraham2gmail.com>
Date: Wed, 25 Jan 2017 12:37:07 -0500
Subject: [PATCH 2/7] WIP: Steal the algorithm used to combine hashes from
 tupleobject.c

---
 pandas/tools/hashing.py            | 61 ++++++++++++++++++------------
 pandas/tools/tests/test_hashing.py |  3 +-
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index 4acf8f036a360..b6635b3d8e053 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -1,19 +1,34 @@
 """
 data hash pandas / numpy objects
 """
+import itertools
 
 import numpy as np
 from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
 from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
-                                 is_datetime64_dtype, is_timedelta64_dtype,
-                                 is_object_dtype)
+                                 is_datetime64_dtype, is_timedelta64_dtype)
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
 
 
+def _combine_hash_arrays(arrays, num_items):
+    first = next(arrays)
+    arrays = itertools.chain([first], arrays)
+
+    mult = np.zeros_like(first) + np.uint64(1000003)
+    out = np.zeros_like(first) + np.uint64(0x345678)
+    for i, a in enumerate(arrays):
+        inverse_i = num_items - i
+        out = (out ^ a) * mult
+        mult += np.uint64(82520 + inverse_i + inverse_i)
+    assert i + 1 == num_items, 'Fed in wrong num_items'
+    out += np.uint64(97531)
+    return out
+
+
 def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                        categorize=True):
     """
@@ -42,10 +57,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
     if hash_key is None:
         hash_key = _default_hash_key
 
-    def adder(h, hashed_to_add):
-        h = np.multiply(h, np.uint(3), h)
-        return np.add(h, hashed_to_add, h)
-
     if isinstance(obj, MultiIndex):
         return _hash_tuples(obj, encoding, hash_key)
 
@@ -57,26 +68,28 @@ def adder(h, hashed_to_add):
         h = hash_array(obj.values, encoding, hash_key,
                        categorize).astype('uint64')
         if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
+            h = _combine_hash_arrays(iter([
+                h,
+                hash_pandas_object(obj.index,
+                                   index=False,
+                                   encoding=encoding,
+                                   hash_key=hash_key,
+                                   categorize=categorize).values]),
+                2)
         h = Series(h, index=obj.index, dtype='uint64')
     elif isinstance(obj, ABCDataFrame):
-        cols = obj.iteritems()
-        first_series = next(cols)[1]
-        h = hash_array(first_series.values, encoding,
-                       hash_key, categorize).astype('uint64')
-        for _, col in cols:
-            h = adder(h, hash_array(col.values, encoding, hash_key,
-                                    categorize))
+        hashes = (hash_array(series.values) for _, series in obj.iteritems())
+        num_items = len(obj.columns)
         if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
+            index_hash_generator = (hash_pandas_object(obj.index,
+                                                       index=False,
+                                                       encoding=encoding,
+                                                       hash_key=hash_key,
+                                                       categorize=categorize).values  # noqa
+                                    for _ in [None])
+            num_items += 1
+            hashes = itertools.chain(hashes, index_hash_generator)
+        h = _combine_hash_arrays(hashes, num_items)
 
         h = Series(h, index=obj.index, dtype='uint64')
     else:
@@ -103,7 +116,7 @@ def _hash_tuples(vals, encoding, hash_key):
         vals = MultiIndex.from_tuples(vals)
 
     # efficiently turn us into a DataFrame and hash
-    return hash_pandas_object(vals.to_dataframe(index=False),
+    return hash_pandas_object(vals.to_frame(index=False),
                               index=False, encoding=encoding,
                               hash_key=hash_key, categorize=False)
 
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
index 211192261db3e..4a2cb93899d21 100644
--- a/pandas/tools/tests/test_hashing.py
+++ b/pandas/tools/tests/test_hashing.py
@@ -62,7 +62,8 @@ def test_hash_list_tuples(self):
         self.assert_numpy_array_equal(result, expected)
 
     def test_multiindex_unique(self):
-        mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
+        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
+                                     (51, 204), (102, 51)])
         self.assertTrue(mi.is_unique)
         result = hash_pandas_object(mi)
         self.assertTrue(result.is_unique)

From e8dd6072430e433185b030f946a1cd83f8372511 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 25 Jan 2017 16:22:59 -0500
Subject: [PATCH 3/7] add hash_tuples

---
 doc/source/whatsnew/v0.20.0.txt    |  1 +
 pandas/tools/hashing.py            | 88 +++++++++++++++++++++++-------
 pandas/tools/tests/test_hashing.py |  9 ++-
 3 files changed, 74 insertions(+), 24 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 2dc15f2fe0781..626ed0b1bac61 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -145,6 +145,7 @@ Other enhancements
 - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
 - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
 - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
+- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index b6635b3d8e053..6b220f19b3dff 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
+import pandas.core.algorithms as algos
 from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
@@ -58,15 +59,16 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
         hash_key = _default_hash_key
 
     if isinstance(obj, MultiIndex):
-        return _hash_tuples(obj, encoding, hash_key)
+        return Series(hash_tuples(obj, encoding, hash_key),
+                      dtype='uint64', copy=False)
 
     if isinstance(obj, ABCIndexClass):
         h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
-        h = Series(h, index=obj, dtype='uint64')
+                       categorize).astype('uint64', copy=False)
+        h = Series(h, index=obj, dtype='uint64', copy=False)
     elif isinstance(obj, ABCSeries):
         h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
+                       categorize).astype('uint64', copy=False)
         if index:
             h = _combine_hash_arrays(iter([
                 h,
@@ -76,7 +78,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                                    hash_key=hash_key,
                                    categorize=categorize).values]),
                 2)
-        h = Series(h, index=obj.index, dtype='uint64')
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
     elif isinstance(obj, ABCDataFrame):
         hashes = (hash_array(series.values) for _, series in obj.iteritems())
         num_items = len(obj.columns)
@@ -91,34 +93,81 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
             hashes = itertools.chain(hashes, index_hash_generator)
         h = _combine_hash_arrays(hashes, num_items)
 
-        h = Series(h, index=obj.index, dtype='uint64')
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
     else:
         raise TypeError("Unexpected type for hashing %s" % type(obj))
     return h
 
 
-def _hash_tuples(vals, encoding, hash_key):
+def _hash_lists(vals, encoding='utf8', hash_key=None):
+    """
+
+    Parameters
+    ----------
+    vals : list of ndarrays
+    encoding : string, default 'utf8'
+        encoding for data & key when strings
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    1d uint64 numpy array of hash values, same length as the vals[0]
+    """
+
+    if not isinstance(vals, list):
+        raise TypeError("only can accept lists")
+
+    if not len(vals):
+        raise ValueError("must pass a non-zero length vals")
+
+    if not isinstance(vals[0], np.ndarray):
+        raise ValueError("must pass a ndarray")
+
+    hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
+              for l in vals)
+    h = _combine_hash_arrays(hashes, len(vals))
+    return h
+
+
+def hash_tuples(vals, encoding='utf8', hash_key=None):
     """
     Hash an MultiIndex / array_of_tuples efficiently
 
     Parameters
     ----------
-    vals : MultiIndex or ndarray of tuples
+    vals : MultiIndex, ndarray of tuples, or single tuple
     encoding : string, default 'utf8'
     hash_key : string key to encode, default to _default_hash_key
 
     Returns
     -------
-    ndarray of hashed values array, same size as len(c)
+    ndarray of hashed values array
     """
 
+    is_tuple = False
+    if isinstance(vals, tuple):
+        vals = [vals]
+        is_tuple = True
+
     if not isinstance(vals, MultiIndex):
         vals = MultiIndex.from_tuples(vals)
 
-    # efficiently turn us into a DataFrame and hash
-    return hash_pandas_object(vals.to_frame(index=False),
-                              index=False, encoding=encoding,
-                              hash_key=hash_key, categorize=False)
+    # create a list-of-ndarrays & hash
+    def get_level_values(num):
+        unique = vals.levels[num]  # .values
+        labels = vals.labels[num]
+        filled = algos.take_1d(unique.values, labels,
+                               fill_value=unique._na_value)
+        return filled
+
+    vals = [get_level_values(level)
+            for level in range(vals.nlevels)]
+
+    result = _hash_lists(vals, encoding=encoding, hash_key=hash_key)
+    if is_tuple:
+        result = result[0]
+
+    return result
 
 
 def _hash_categorical(c, encoding, hash_key):
@@ -138,7 +187,7 @@ def _hash_categorical(c, encoding, hash_key):
     """
     cat_hashed = hash_array(c.categories.values, encoding, hash_key,
                             categorize=False).astype(np.uint64, copy=False)
-    return c.rename_categories(cat_hashed).astype(np.uint64)
+    return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)
 
 
 def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -168,10 +217,6 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     if hash_key is None:
         hash_key = _default_hash_key
 
-    if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple):
-        # we hash an list of tuples similar to a MultiIndex
-        return _hash_tuples(vals, encoding, hash_key).values
-
     # For categoricals, we hash the categories, then remap the codes to the
     # hash values. (This check is above the complex check so that we don't ask
     # numpy if categorical is a subdtype of complex, as it will choke.
@@ -187,9 +232,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     # manage it.
     if is_bool_array(vals):
         vals = vals.astype('u8')
-    elif ((is_datetime64_dtype(vals) or
-           is_timedelta64_dtype(vals) or
-           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
+    elif (is_datetime64_dtype(vals) or
+          is_timedelta64_dtype(vals)):
+        vals = vals.view('i8').astype('u8', copy=False)
+    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
         vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
     else:
         # With repeated values, its MUCH faster to categorize object dtypes,
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
index 4a2cb93899d21..0eb7f5ef6c8ad 100644
--- a/pandas/tools/tests/test_hashing.py
+++ b/pandas/tools/tests/test_hashing.py
@@ -2,7 +2,7 @@
 import pandas as pd
 
 from pandas import DataFrame, Series, Index, MultiIndex
-from pandas.tools.hashing import hash_array, hash_pandas_object
+from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object
 import pandas.util.testing as tm
 
 
@@ -55,12 +55,15 @@ def check_not_equal_with_index(self, obj):
             b = hash_pandas_object(obj, index=False)
             self.assertFalse((a == b).all())
 
-    def test_hash_list_tuples(self):
+    def test_hash_tuples(self):
         tups = [(1, 'one'), (1, 'two'), (2, 'one')]
-        result = hash_array(tups)
+        result = hash_tuples(tups)
         expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
         self.assert_numpy_array_equal(result, expected)
 
+        result = hash_tuples(tups[0])
+        self.assertEqual(result, expected[0])
+
     def test_multiindex_unique(self):
         mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                      (51, 204), (102, 51)])

From 0c13df7ea55d5018848708abfee6ff572855689c Mon Sep 17 00:00:00 2001
From: Mike Graham <mikegraham2gmail.com>
Date: Wed, 25 Jan 2017 12:37:07 -0500
Subject: [PATCH 4/7] Steal the algorithm used to combine hashes from
 tupleobject.c

---
 pandas/tools/hashing.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index 6b220f19b3dff..281fbcaf0c22d 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -16,6 +16,7 @@
 
 
 def _combine_hash_arrays(arrays, num_items):
+    "Should be the same as CPython's tupleobject.c"
     first = next(arrays)
     arrays = itertools.chain([first], arrays)
 
@@ -23,7 +24,8 @@ def _combine_hash_arrays(arrays, num_items):
     out = np.zeros_like(first) + np.uint64(0x345678)
     for i, a in enumerate(arrays):
         inverse_i = num_items - i
-        out = (out ^ a) * mult
+        out ^= a
+        out *= mult
         mult += np.uint64(82520 + inverse_i + inverse_i)
     assert i + 1 == num_items, 'Fed in wrong num_items'
     out += np.uint64(97531)
@@ -70,15 +72,17 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
         h = hash_array(obj.values, encoding, hash_key,
                        categorize).astype('uint64', copy=False)
         if index:
-            h = _combine_hash_arrays(iter([
-                h,
-                hash_pandas_object(obj.index,
-                                   index=False,
-                                   encoding=encoding,
-                                   hash_key=hash_key,
-                                   categorize=categorize).values]),
-                2)
+            index_iter = (hash_pandas_object(obj.index,
+                                             index=False,
+                                             encoding=encoding,
+                                             hash_key=hash_key,
+                                             categorize=categorize).values
+                          for _ in [None])
+            arrays = itertools.chain([h], index_iter)
+            h = _combine_hash_arrays(arrays, 2)
+
         h = Series(h, index=obj.index, dtype='uint64', copy=False)
+
     elif isinstance(obj, ABCDataFrame):
         hashes = (hash_array(series.values) for _, series in obj.iteritems())
         num_items = len(obj.columns)

From 58f682d369e53f74830a3247f8976a8c6cc296e5 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Thu, 26 Jan 2017 09:12:27 -0500
Subject: [PATCH 5/7] memory optimization

---
 pandas/tools/hashing.py            | 26 ++++++++++++++++++++------
 pandas/tools/tests/test_hashing.py | 10 +++++++++-
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index 281fbcaf0c22d..e3645fab9a164 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -9,18 +9,30 @@
 from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
-                                 is_datetime64_dtype, is_timedelta64_dtype)
+                                 is_datetime64_dtype, is_timedelta64_dtype,
+                                 is_list_like)
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
 
 
 def _combine_hash_arrays(arrays, num_items):
-    "Should be the same as CPython's tupleobject.c"
-    first = next(arrays)
+    """
+    Parameters
+    ----------
+    arrays : generator
+    num_items : int
+
+    Should be the same as CPython's tupleobject.c
+    """
+    try:
+        first = next(arrays)
+    except StopIteration:
+        return np.array([], dtype=np.uint64)
+
     arrays = itertools.chain([first], arrays)
 
-    mult = np.zeros_like(first) + np.uint64(1000003)
+    mult = np.uint64(1000003)
     out = np.zeros_like(first) + np.uint64(0x345678)
     for i, a in enumerate(arrays):
         inverse_i = num_items - i
@@ -135,11 +147,11 @@ def _hash_lists(vals, encoding='utf8', hash_key=None):
 
 def hash_tuples(vals, encoding='utf8', hash_key=None):
     """
-    Hash an MultiIndex / array_of_tuples efficiently
+    Hash an MultiIndex / list-of-tuples efficiently
 
     Parameters
     ----------
-    vals : MultiIndex, ndarray of tuples, or single tuple
+    vals : MultiIndex, list-of-tuples, or single tuple
     encoding : string, default 'utf8'
     hash_key : string key to encode, default to _default_hash_key
 
@@ -152,6 +164,8 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
     if isinstance(vals, tuple):
         vals = [vals]
         is_tuple = True
+    elif not is_list_like(vals):
+        raise TypeError("must be convertible to a list-of-tuples")
 
     if not isinstance(vals, MultiIndex):
         vals = MultiIndex.from_tuples(vals)
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
index 0eb7f5ef6c8ad..0deb5c8b89396 100644
--- a/pandas/tools/tests/test_hashing.py
+++ b/pandas/tools/tests/test_hashing.py
@@ -53,7 +53,8 @@ def check_not_equal_with_index(self, obj):
         if not isinstance(obj, Index):
             a = hash_pandas_object(obj, index=True)
             b = hash_pandas_object(obj, index=False)
-            self.assertFalse((a == b).all())
+            if len(obj):
+                self.assertFalse((a == b).all())
 
     def test_hash_tuples(self):
         tups = [(1, 'one'), (1, 'two'), (2, 'one')]
@@ -64,6 +65,11 @@ def test_hash_tuples(self):
         result = hash_tuples(tups[0])
         self.assertEqual(result, expected[0])
 
+    def test_hash_tuples_err(self):
+
+        for val in [5, 'foo', pd.Timestamp('20130101')]:
+            self.assertRaises(TypeError, hash_tuples, val)
+
     def test_multiindex_unique(self):
         mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                      (51, 204), (102, 51)])
@@ -81,9 +87,11 @@ def test_hash_pandas_object(self):
                     Series(['a', np.nan, 'c']),
                     Series(['a', None, 'c']),
                     Series([True, False, True]),
+                    Series(),
                     Index([1, 2, 3]),
                     Index([True, False, True]),
                     DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
+                    DataFrame(),
                     tm.makeMissingDataframe(),
                     tm.makeMixedDataFrame(),
                     tm.makeTimeDataFrame(),

From 48a2402f02b6430e60a48f3b1ff0bb5248dce638 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Thu, 26 Jan 2017 18:26:09 -0500
Subject: [PATCH 6/7] support for mixed type arrays

---
 pandas/tools/hashing.py            | 10 +++++++++-
 pandas/tools/tests/test_hashing.py | 14 +++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index e3645fab9a164..12228afe5e2c1 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -149,6 +149,8 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
     """
     Hash an MultiIndex / list-of-tuples efficiently
 
+    .. versionadded:: 0.20.0
+
     Parameters
     ----------
     vals : MultiIndex, list-of-tuples, or single tuple
@@ -265,7 +267,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
                               ordered=False, fastpath=True)
             return _hash_categorical(cat, encoding, hash_key)
 
-        vals = _hash.hash_object_array(vals, hash_key, encoding)
+        try:
+            vals = _hash.hash_object_array(vals, hash_key, encoding)
+        except TypeError:
+
+            # we have mixed types
+            vals = _hash.hash_object_array(vals.astype(str).astype(object),
+                                           hash_key, encoding)
 
     # Then, redistribute these 64-bit ints within the space of 64-bit ints
     vals ^= vals >> 30
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
index 0deb5c8b89396..9faed167b5a9a 100644
--- a/pandas/tools/tests/test_hashing.py
+++ b/pandas/tools/tests/test_hashing.py
@@ -36,6 +36,11 @@ def test_hash_array(self):
             a = s.values
             tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
 
+    def test_hash_array_mixed(self):
+        for data in [np.array([3, 4, 'All']),
+                     np.array([3, 4, 'All'], dtype=object)]:
+            tm.assert_numpy_array_equal(hash_array(data), hash_array(data))
+
     def check_equal(self, obj, **kwargs):
         a = hash_pandas_object(obj, **kwargs)
         b = hash_pandas_object(obj, **kwargs)
@@ -159,15 +164,6 @@ def f():
             hash_pandas_object(Series(list('abc')), hash_key='foo')
         self.assertRaises(ValueError, f)
 
-    def test_unsupported_objects(self):
-
-        # mixed objects are not supported
-        obj = Series(['1', 2, 3])
-
-        def f():
-            hash_pandas_object(obj)
-        self.assertRaises(TypeError, f)
-
     def test_alread_encoded(self):
         # if already encoded then ok
 

From 8b1d3f9c5718799b5b0f2e1d8069d16fab01324a Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Thu, 26 Jan 2017 18:45:51 -0500
Subject: [PATCH 7/7] not correctly hashing categorical in a MI

---
 pandas/tools/hashing.py            | 43 ++++++------------------------
 pandas/tools/tests/test_hashing.py |  9 ++++++-
 2 files changed, 16 insertions(+), 36 deletions(-)

diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index 12228afe5e2c1..a62c80c6f8d67 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -115,36 +115,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
     return h
 
 
-def _hash_lists(vals, encoding='utf8', hash_key=None):
-    """
-
-    Parameters
-    ----------
-    vals : list of ndarrays
-    encoding : string, default 'utf8'
-        encoding for data & key when strings
-    hash_key : string key to encode, default to _default_hash_key
-
-    Returns
-    -------
-    1d uint64 numpy array of hash values, same length as the vals[0]
-    """
-
-    if not isinstance(vals, list):
-        raise TypeError("only can accept lists")
-
-    if not len(vals):
-        raise ValueError("must pass a non-zero length vals")
-
-    if not isinstance(vals[0], np.ndarray):
-        raise ValueError("must pass a ndarray")
-
-    hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
-              for l in vals)
-    h = _combine_hash_arrays(hashes, len(vals))
-    return h
-
-
 def hash_tuples(vals, encoding='utf8', hash_key=None):
     """
     Hash an MultiIndex / list-of-tuples efficiently
@@ -172,22 +142,25 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
     if not isinstance(vals, MultiIndex):
         vals = MultiIndex.from_tuples(vals)
 
-    # create a list-of-ndarrays & hash
+    # create a list-of-ndarrays
     def get_level_values(num):
         unique = vals.levels[num]  # .values
         labels = vals.labels[num]
-        filled = algos.take_1d(unique.values, labels,
+        filled = algos.take_1d(unique._values, labels,
                                fill_value=unique._na_value)
         return filled
 
     vals = [get_level_values(level)
             for level in range(vals.nlevels)]
 
-    result = _hash_lists(vals, encoding=encoding, hash_key=hash_key)
+    # hash the list-of-ndarrays
+    hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
+              for l in vals)
+    h = _combine_hash_arrays(hashes, len(vals))
     if is_tuple:
-        result = result[0]
+        h = h[0]
 
-    return result
+    return h
 
 
 def _hash_categorical(c, encoding, hash_key):
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
index 9faed167b5a9a..ed5a74f8cfcf2 100644
--- a/pandas/tools/tests/test_hashing.py
+++ b/pandas/tools/tests/test_hashing.py
@@ -102,10 +102,17 @@ def test_hash_pandas_object(self):
                     tm.makeTimeDataFrame(),
                     tm.makeTimeSeries(),
                     tm.makeTimedeltaIndex(),
+                    tm.makePeriodIndex(),
+                    Series(tm.makePeriodIndex()),
+                    Series(pd.date_range('20130101',
+                                         periods=3, tz='US/Eastern')),
                     MultiIndex.from_product(
                         [range(5),
                          ['foo', 'bar', 'baz'],
-                         pd.date_range('20130101', periods=2)])]:
+                         pd.date_range('20130101', periods=2)]),
+                    MultiIndex.from_product(
+                        [pd.CategoricalIndex(list('aabc')),
+                         range(3)])]:
             self.check_equal(obj)
             self.check_not_equal_with_index(obj)