Skip to content

Commit bd35efc

Browse files
committed
ENH: add data hashing routines
xref dask/dask#1807
1 parent 22d982a commit bd35efc

File tree

5 files changed

+210
-2
lines changed

5 files changed

+210
-2
lines changed

doc/source/whatsnew/v0.19.2.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ Highlights include:
1616
:backlinks: none
1717

1818

19+
.. _whatsnew_0192.enhancements:
20+
21+
Enhancements
22+
~~~~~~~~~~~~
23+
- Routines to provide a hash for pandas objects (xref dask/dask#1807)
24+
25+
1926
.. _whatsnew_0192.performance:
2027

2128
Performance Improvements

pandas/core/base.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pandas.util.decorators import (Appender, cache_readonly,
1717
deprecate_kwarg, Substitution)
1818
from pandas.core.common import AbstractMethodError
19+
from pandas.tools.hashing import hash_pandas_object
1920
from pandas.formats.printing import pprint_thing
2021

2122
_shared_docs = dict()
@@ -795,7 +796,50 @@ def __unicode__(self):
795796
return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)
796797

797798

798-
class IndexOpsMixin(object):
799+
class HashableMixin(object):
    """ mixin that equips pandas objects with a data-hash method """

    def hash(self, index=True):
        """
        Compute an element-wise data hash of this object.

        The work is delegated to ``hash_pandas_object``.

        Parameters
        ----------
        index : boolean, default True
            include the index in the hash (if Series/DataFrame)

        Returns
        -------
        1d uint64 numpy array of hash values, same length as the
        object

        Examples
        --------
        >>> pd.Index([1, 2, 3]).hash()
        array([6238072747940578789, 15839785061582574730,
               2185194620014831856], dtype=uint64)

        >>> pd.Series([1, 2, 3]).hash()
        array([267474170112184751, 16863939785269199747,
               3948624847917518682], dtype=uint64)

        >>> pd.Series([1, 2, 3]).hash(index=False)
        array([6238072747940578789, 15839785061582574730,
               2185194620014831856], dtype=uint64)

        >>> pd.DataFrame({'A': [1, 2, 3]}).hash()
        array([267474170112184751, 16863939785269199747,
               3948624847917518682], dtype=uint64)

        >>> pd.DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}).hash()
        array([10443389771536978168, 14762841401800935363,
               9751253963311919054], dtype=uint64)

        """
        hashed = hash_pandas_object(self, index=index)
        return hashed
840+
841+
842+
class IndexOpsMixin(HashableMixin):
799843
""" common ops mixin to support a unified interface / docs for Series /
800844
Index
801845
"""

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@
201201
# DataFrame class
202202

203203

204-
class DataFrame(NDFrame):
204+
class DataFrame(NDFrame, base.HashableMixin):
205205
""" Two-dimensional size-mutable, potentially heterogeneous tabular data
206206
structure with labeled axes (rows and columns). Arithmetic operations
207207
align on both row and column labels. Can be thought of as a dict-like

pandas/tools/hashing.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
"""
2+
data hash pandas / numpy objects
3+
"""
4+
5+
import numpy as np
6+
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
7+
from pandas.types.common import is_categorical_dtype
8+
9+
10+
def hash_pandas_object(obj, index=True):
    """
    Return a data hash of the Index/Series/DataFrame

    Parameters
    ----------
    obj : Index, Series or DataFrame
        the object to hash
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the
    object

    Raises
    ------
    TypeError
        if ``obj`` is not an Index, Series or DataFrame

    Notes
    -----
    A DataFrame without any columns contributes an all-zero hash per row
    (combined with the index hash when ``index`` is True) instead of
    failing on the empty column iterator.
    """

    def adder(h, hashed_to_add):
        # h <- 3 * h + hashed_to_add, computed in place on ``h``.
        # An explicit 64-bit multiplier keeps the arithmetic in uint64 on
        # every platform; array overflow wraps modulo 2**64.
        h = np.multiply(h, np.uint64(3), h)
        return np.add(h, hashed_to_add, h)

    if isinstance(obj, ABCIndexClass):
        # astype always copies, so ``adder`` never mutates a shared array
        h = hash_array(obj.values).astype('uint64')
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values).astype('uint64')
        if index:
            h = adder(h, hash_pandas_object(obj.index))
    elif isinstance(obj, ABCDataFrame):
        h = None
        for _, col in obj.iteritems():
            hashed = hash_array(col.values)
            if h is None:
                # the first column seeds the accumulator
                h = hashed.astype('uint64')
            else:
                h = adder(h, hashed)
        if h is None:
            # no columns at all: start from a neutral, row-length hash
            h = np.zeros(len(obj), dtype='uint64')
        if index:
            h = adder(h, hash_pandas_object(obj.index))
    else:
        raise TypeError("Unexpected type %s" % type(obj))
    return h
46+
47+
48+
def hash_array(vals):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : 1d ndarray or Categorical
        the values to hash

    Returns
    -------
    1d uint64 numpy array of hash values, same length as ``vals``

    Notes
    -----
    Values that cannot be reinterpreted as (at most) 64-bit integers are
    hashed with the builtin ``hash``, so the result for such values is only
    as stable as ``hash`` itself is for them.
    """
    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    # (np.bool_ is the scalar type; the bare ``np.bool`` alias is deprecated
    # and missing from some numpy releases)
    if vals.dtype == np.bool_:
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
            np.issubdtype(vals.dtype, np.timedelta64) or
            np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        # reinterpret the raw bytes as unsigned ints, then widen to 64 bits
        # (astype copies, so the caller's array is never modified below)
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # builtin hashes may be negative; fold them into the unsigned
        # 64-bit range explicitly instead of relying on numpy to wrap
        # out-of-range Python ints (which newer numpy refuses to do)
        mask = 0xFFFFFFFFFFFFFFFF
        vals = np.array([hash(x) & mask for x in vals], dtype=np.uint64)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals

pandas/tools/tests/test_hashing.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from pandas import DataFrame, Series, Index
5+
from pandas.tools.hashing import hash_array, hash_pandas_object
6+
import pandas.util.testing as tm
7+
8+
9+
class TestHashing(tm.TestCase):
    """ tests for the data hashing routines in pandas.tools.hashing """

    _multiprocess_can_split_ = True

    def setUp(self):
        # one column per dtype family handled by hash_array
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_hash_array(self):
        # hashing the same values twice must give the same result
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj):
        # repeated hashing is stable through every entry point
        a = hash_pandas_object(obj)
        b = hash_pandas_object(obj)
        tm.assert_numpy_array_equal(a, b)

        a = obj.hash()
        b = obj.hash()
        tm.assert_numpy_array_equal(a, b)

        a = obj.hash(index=False)
        b = obj.hash(index=False)
        tm.assert_numpy_array_equal(a, b)

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = obj.hash(index=True)
            b = obj.hash(index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series([True, False, True]),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex(),
                    Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
                        [('a', 1), ('a', 2), ('b', 1)]))]:
            self.check_equal(obj)

    def test_errors(self):
        # unsupported inputs must be rejected; pass each object itself to
        # hash_pandas_object (not a helper closure) so every listed type
        # is actually exercised
        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
            self.assertRaises(TypeError, hash_pandas_object, obj)

0 commit comments

Comments
 (0)