Skip to content

Commit f6a54f7

Browse files
committed
ENH: add data hashing routines
xref dask/dask#1807
1 parent 22d982a commit f6a54f7

File tree

5 files changed

+166
-2
lines changed

5 files changed

+166
-2
lines changed

doc/source/whatsnew/v0.19.2.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ Highlights include:
1616
:backlinks: none
1717

1818

19+
.. _whatsnew_0192.enhancements:
20+
21+
Enhancements
22+
~~~~~~~~~~~~
23+
- Routines to provide a data hash for pandas objects (xref dask/dask#1807)
24+
25+
1926
.. _whatsnew_0192.performance:
2027

2128
Performance Improvements

pandas/core/base.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,42 @@ def __unicode__(self):
795795
return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)
796796

797797

798-
class IndexOpsMixin(object):
798+
class HashableMixin(object):
    """ mixin that gives pandas objects a data ``hash`` method """

    def hash(self):
        """
        Compute a data hash of this Series, Index or DataFrame

        The work is delegated to
        ``pandas.tools.hashing.hash_pandas_object``, which is imported
        lazily at call time.

        Returns
        -------
        1d uint64 numpy array of hash values, same length as the
        object

        Examples
        --------
        >>> pd.Index([1, 2, 3]).hash()
        array([6238072747940578789, 15839785061582574730,
               2185194620014831856], dtype=uint64)

        >>> pd.Series([1,2,3]).hash()
        array([6238072747940578789, 15839785061582574730,
               2185194620014831856], dtype=uint64)

        >>> pd.DataFrame({'A': [1,2,3]}).hash()
        array([6238072747940578789, 15839785061582574730,
               2185194620014831856], dtype=uint64)

        >>> pd.DataFrame({'A': [1,2,3], 'B': ['foo', 'bar', 'baz']}).hash()
        array([13816314034815948011, 13993887747723865506,
               2896873748383756390], dtype=uint64)

        """
        # function-level import, as in the original implementation
        from pandas.tools import hashing
        return hashing.hash_pandas_object(self)
831+
832+
833+
class IndexOpsMixin(HashableMixin):
799834
""" common ops mixin to support a unified inteface / docs for Series /
800835
Index
801836
"""

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@
201201
# DataFrame class
202202

203203

204-
class DataFrame(NDFrame):
204+
class DataFrame(NDFrame, base.HashableMixin):
205205
""" Two-dimensional size-mutable, potentially heterogeneous tabular data
206206
structure with labeled axes (rows and columns). Arithmetic operations
207207
align on both row and column labels. Can be thought of as a dict-like

pandas/tools/hashing.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
data hash pandas / numpy objects
3+
"""
4+
5+
import numpy as np
6+
import pandas as pd
7+
8+
from pandas.types.common import is_categorical_dtype
9+
10+
11+
def hash_pandas_object(obj):
12+
if isinstance(obj, (pd.Series, pd.Index)):
13+
h = hash_array(obj.values).astype('uint64')
14+
elif isinstance(obj, pd.DataFrame):
15+
cols = obj.iteritems()
16+
first_series = next(cols)[1]
17+
h = hash_array(first_series.values).astype('uint64')
18+
for _, col in cols:
19+
h = np.multiply(h, np.uint(3), h)
20+
h = np.add(h, hash_array(col.values), h)
21+
else:
22+
raise TypeError("Unexpected type %s" % type(obj))
23+
return h
24+
25+
26+
def hash_array(vals):
27+
"""Given a 1d array, return an array of deterministic integers."""
28+
# work with cagegoricals as ints. (This check is above the complex
29+
# check so that we don't ask numpy if categorical is a subdtype of
30+
# complex, as it will choke.
31+
if is_categorical_dtype(vals.dtype):
32+
vals = vals.codes
33+
34+
# we'll be working with everything as 64-bit values, so handle this
35+
# 128-bit value early
36+
if np.issubdtype(vals.dtype, np.complex128):
37+
return hash_array(vals.real) + 23 * hash_array(vals.imag)
38+
39+
# MAIN LOGIC:
40+
41+
# First, turn whatever array this is into unsigned 64-bit ints, if we can
42+
# manage it.
43+
if vals.dtype == np.bool:
44+
vals = vals.astype('u8')
45+
46+
if (np.issubdtype(vals.dtype, np.datetime64) or
47+
np.issubdtype(vals.dtype, np.timedelta64) or
48+
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
49+
50+
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
51+
else:
52+
vals = np.array([hash(x) for x in vals], dtype=np.uint64)
53+
54+
# Then, redistribute these 64-bit ints within the space of 64-bit ints
55+
vals ^= vals >> 30
56+
vals *= np.uint64(0xbf58476d1ce4e5b9)
57+
vals ^= vals >> 27
58+
vals *= np.uint64(0x94d049bb133111eb)
59+
vals ^= vals >> 31
60+
return vals

pandas/tools/tests/test_hashing.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from pandas import DataFrame, Series, Index
5+
from pandas.tools.hashing import hash_array, hash_pandas_object
6+
import pandas.util.testing as tm
7+
8+
9+
class TestHashing(tm.TestCase):
10+
11+
_multiprocess_can_split_ = True
12+
13+
def setUp(self):
14+
self.df = DataFrame(
15+
{'i32': np.array([1, 2, 3] * 3, dtype='int32'),
16+
'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
17+
'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
18+
'obj': Series(['d', 'e', 'f'] * 3),
19+
'bool': np.array([True, False, True] * 3),
20+
'dt': Series(pd.date_range('20130101', periods=9)),
21+
'dt_tz': Series(pd.date_range('20130101', periods=9,
22+
tz='US/Eastern')),
23+
'td': Series(pd.timedelta_range('2000', periods=9))})
24+
25+
def test_hash_array(self):
26+
for name, s in self.df.iteritems():
27+
a = s.values
28+
tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
29+
30+
def check_equal(self, obj):
31+
a = hash_pandas_object(obj)
32+
b = hash_pandas_object(obj)
33+
tm.assert_numpy_array_equal(a, b)
34+
35+
a = obj.hash()
36+
b = obj.hash()
37+
tm.assert_numpy_array_equal(a, b)
38+
39+
def test_hash_pandas_object(self):
40+
41+
for obj in [Series([1, 2, 3]),
42+
Series([1.0, 1.5, 3.2]),
43+
Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
44+
Series(['a', 'b', 'c']),
45+
Series([True, False, True]),
46+
Index([1, 2, 3]),
47+
Index([True, False, True]),
48+
DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
49+
tm.makeMissingDataframe(),
50+
tm.makeMixedDataFrame(),
51+
tm.makeTimeDataFrame(),
52+
tm.makeTimeSeries(),
53+
tm.makeTimedeltaIndex()]:
54+
self.check_equal(obj)
55+
56+
def test_errors(self):
57+
58+
for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
59+
def f():
60+
hash_pandas_object(f)
61+
62+
self.assertRaises(TypeError, f)

0 commit comments

Comments
 (0)