
Commit 0f8cf5d

ENH: add data hashing routines
xref dask/dask#1807
1 parent 22d982a

8 files changed (+550, -4 lines)

asv_bench/benchmarks/algorithms.py

Lines changed: 33 additions & 0 deletions
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+from pandas.util import testing as tm


 class algorithm(object):
@@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self):

     def time_add_overflow_mixed_arr(self):
         self.checked_add(self.arr, self.arrmixed)
+
+
+class hashing(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+
+        self.df = pd.DataFrame(
+            {'A': pd.Series(tm.makeStringIndex(100).take(
+                np.random.randint(0, 100, size=N))),
+             'B': pd.Series(tm.makeStringIndex(10000).take(
+                 np.random.randint(0, 10000, size=N))),
+             'D': np.random.randn(N),
+             'E': np.arange(N),
+             'F': pd.date_range('20110101', freq='s', periods=N),
+             'G': pd.timedelta_range('1 day', freq='s', periods=N),
+             })
+        self.df['C'] = self.df['B'].astype('category')
+        self.df.iloc[10:20] = np.nan
+
+    def time_frame(self):
+        self.df.hash()
+
+    def time_series_int(self):
+        self.df.E.hash()
+
+    def time_series_string(self):
+        self.df.B.hash()
+
+    def time_series_categorical(self):
+        self.df.C.hash()

doc/source/whatsnew/v0.19.2.txt

Lines changed: 8 additions & 0 deletions
@@ -16,6 +16,14 @@ Highlights include:
     :backlinks: none


+.. _whatsnew_0192.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+- ``Series/DataFrame/Index`` gain a ``.hash()`` method to provide a data hash (:issue:`14729`)
+
+
 .. _whatsnew_0192.performance:

 Performance Improvements
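
A minimal usage sketch of the API this entry describes, assuming the ``.hash()`` signature added by this commit (``index=True``, ``encoding='utf8'``, ``hash_key=None``). The docstring examples below show fixed output values, so with the default key the result is deterministic:

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    # one uint64 hash per row, including the index by default
    row_hashes = df.hash()
    assert len(row_hashes) == len(df)

    # excluding the index makes the hash depend only on the values
    values_only = df.hash(index=False)

    # deterministic with the default key, so it can serve as a cheap
    # fingerprint when comparing data across processes (the dask use
    # case referenced in the commit message)
    assert (df.hash() == row_hashes).all()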

pandas/core/base.py

Lines changed: 75 additions & 1 deletion
@@ -795,7 +795,81 @@ def __unicode__(self):
         return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)


-class IndexOpsMixin(object):
+class HashableMixin(object):
+    """ provide methods for hashable pandas objects """
+
+    def hash(self, index=True, encoding='utf8', hash_key=None):
+        """
+        Return a data hash of the Series/DataFrame.
+        This is a 1-d array of unique hashes of all of the elements in that
+        row, including the Index if desired.
+
+        Parameters
+        ----------
+        index : boolean, default True
+            include the index in the hash (if Series/DataFrame)
+        encoding : string, default 'utf8'
+            encoding for data & key when strings
+        hash_key : string, must be 16 bytes in length if passed
+
+        Returns
+        -------
+        1d uint64 numpy array of hash values, same length as the
+        object
+
+        Examples
+        --------
+        >>> pd.Index([1, 2, 3]).hash()
+        array([ 6238072747940578789, 15839785061582574730,
+                2185194620014831856], dtype=uint64)
+
+        >>> pd.Series([1, 2, 3]).hash()
+        array([  267474170112184751, 16863939785269199747,
+                3948624847917518682], dtype=uint64)
+
+        >>> pd.Series([1, 2, 3]).hash(index=False)
+        array([ 6238072747940578789, 15839785061582574730,
+                2185194620014831856], dtype=uint64)
+
+        >>> pd.DataFrame({'A': [1, 2, 3]}).hash()
+        array([  267474170112184751, 16863939785269199747,
+                3948624847917518682], dtype=uint64)
+
+        >>> pd.DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}).hash()
+        array([11603696091789712533,  5345384428795788641,
+                  46691607209239364], dtype=uint64)
+
+
+        Notes
+        -----
+        These functions do not hash attributes attached to the object,
+        e.g. name for Index/Series. Nor do they hash the columns of
+        a DataFrame.
+
+        Mixed dtypes within a Series (or a column of a DataFrame) will
+        be stringified, for example:
+
+        >>> Series(['1', 2, 3]).hash()
+        array([ 8973981985592347666, 16940873351292606887,
+               10100427194775696709], dtype=uint64)
+
+        >>> Series(['1', '2', '3']).hash()
+        array([ 8973981985592347666, 16940873351292606887,
+               10100427194775696709], dtype=uint64)
+
+        These have the same data hash, while a pure dtype is different:
+
+        >>> Series([1, 2, 3]).hash()
+        array([  267474170112184751, 16863939785269199747,
+                3948624847917518682], dtype=uint64)
+
+        """
+        from pandas.tools.hashing import hash_pandas_object
+        return hash_pandas_object(self, index=index, encoding=encoding,
+                                  hash_key=hash_key)
+
+
+class IndexOpsMixin(HashableMixin):
     """ common ops mixin to support a unified interface / docs for Series /
     Index
     """

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@
 # DataFrame class


-class DataFrame(NDFrame):
+class DataFrame(NDFrame, base.HashableMixin):
     """ Two-dimensional size-mutable, potentially heterogeneous tabular data
     structure with labeled axes (rows and columns). Arithmetic operations
     align on both row and column labels. Can be thought of as a dict-like

pandas/src/hash.pyx

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
+# cython: profile=False
+# Translated from the reference implementation
+# at https://github.com/veorq/SipHash
+
+import cython
+cimport numpy as cnp
+import numpy as np
+from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
+
+from cpython cimport (PyString_Check,
+                      PyBytes_Check,
+                      PyUnicode_Check)
+from libc.stdlib cimport malloc, free
+
+cdef extern from "stdlib.h":
+    void memcpy(void *dst, void *src, size_t n)
+
+DEF cROUNDS = 2
+DEF dROUNDS = 4
+
+
+def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
+    """
+    Parameters
+    ----------
+    arr : 1-d object ndarray of objects
+    key : hash key, must be 16 byte len encoded
+    encoding : encoding for key & arr, default to 'utf8'
+
+    Returns
+    -------
+    1-d uint64 ndarray of hashes
+
+    """
+    cdef:
+        Py_ssize_t i, l, n
+        ndarray[uint64_t] result
+        bytes data, k
+        uint8_t *kb, *lens
+        char **vecs, *cdata
+        object val
+
+    k = <bytes>key.encode(encoding)
+    kb = <uint8_t *>k
+    if len(k) != 16:
+        raise ValueError(
+            'key should be a 16-byte string encoded, got {!r} (len {})'.format(
+                k, len(k)))
+
+    n = len(arr)
+
+    # create an array of bytes
+    vecs = <char **> malloc(n * sizeof(char *))
+    lens = <uint8_t*> malloc(n * sizeof(uint8_t))
+
+    for i in range(n):
+        val = arr[i]
+        if PyString_Check(val):
+            data = <bytes>val.encode(encoding)
+        elif PyBytes_Check(val):
+            data = <bytes>val
+        elif PyUnicode_Check(val):
+            data = <bytes>val.encode(encoding)
+        else:
+            # non-strings
+            data = <bytes>str(val).encode(encoding)
+
+        l = len(data)
+        lens[i] = l
+        vecs[i] = <char *> malloc(l * sizeof(char))
+        cdata = data
+        memcpy(vecs[i], cdata, l)
+
+    result = np.empty(n, dtype=np.uint64)
+    with nogil:
+        for i in range(n):
+            result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
+            free(vecs[i])
+
+    free(vecs)
+    free(lens)
+    return result
+
+cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
+    return (x << b) | (x >> (64 - b))
+
+cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
+    p[0] = <uint8_t>(v)
+    p[1] = <uint8_t>(v >> 8)
+    p[2] = <uint8_t>(v >> 16)
+    p[3] = <uint8_t>(v >> 24)
+
+cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
+    u32to8_le(p, <uint32_t>v)
+    u32to8_le(p + 4, <uint32_t>(v >> 32))
+
+cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
+    return (<uint64_t>p[0] |
+            <uint64_t>p[1] << 8 |
+            <uint64_t>p[2] << 16 |
+            <uint64_t>p[3] << 24 |
+            <uint64_t>p[4] << 32 |
+            <uint64_t>p[5] << 40 |
+            <uint64_t>p[6] << 48 |
+            <uint64_t>p[7] << 56)
+
+cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
+                           uint64_t* v2, uint64_t* v3) nogil:
+    v0[0] += v1[0]
+    v1[0] = _rotl(v1[0], 13)
+    v1[0] ^= v0[0]
+    v0[0] = _rotl(v0[0], 32)
+    v2[0] += v3[0]
+    v3[0] = _rotl(v3[0], 16)
+    v3[0] ^= v2[0]
+    v0[0] += v3[0]
+    v3[0] = _rotl(v3[0], 21)
+    v3[0] ^= v0[0]
+    v2[0] += v1[0]
+    v1[0] = _rotl(v1[0], 17)
+    v1[0] ^= v2[0]
+    v2[0] = _rotl(v2[0], 32)
+
+cpdef uint64_t siphash(bytes data, bytes key) except? 0:
+    if len(key) != 16:
+        raise ValueError(
+            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
+                key, len(key)))
+    return low_level_siphash(data, len(data), key)
+
+
+@cython.cdivision(True)
+cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
+                                uint8_t* key) nogil:
+    cdef uint64_t v0 = 0x736f6d6570736575ULL
+    cdef uint64_t v1 = 0x646f72616e646f6dULL
+    cdef uint64_t v2 = 0x6c7967656e657261ULL
+    cdef uint64_t v3 = 0x7465646279746573ULL
+    cdef uint64_t b
+    cdef uint64_t k0 = u8to64_le(key)
+    cdef uint64_t k1 = u8to64_le(key + 8)
+    cdef uint64_t m
+    cdef int i
+    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
+    cdef int left = datalen & 7
+    cdef int left_byte
+
+    b = (<uint64_t>datalen) << 56
+    v3 ^= k1
+    v2 ^= k0
+    v1 ^= k1
+    v0 ^= k0
+
+    while (data != end):
+        m = u8to64_le(data)
+        v3 ^= m
+        for i in range(cROUNDS):
+            _sipround(&v0, &v1, &v2, &v3)
+        v0 ^= m
+
+        data += sizeof(uint64_t)
+
+    for i in range(left-1, -1, -1):
+        b |= (<uint64_t>data[i]) << (i * 8)
+
+    v3 ^= b
+
+    for i in range(cROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    v0 ^= b
+    v2 ^= 0xff
+
+    for i in range(dROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    b = v0 ^ v1 ^ v2 ^ v3
+
+    return b
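
To make the algorithm easier to follow without the Cython machinery, here is a pure-Python sketch of the same SipHash-2-4 construction. It is not part of the commit; it mirrors the reference algorithm that hash.pyx translates, and the ``siphash24`` name is chosen just for illustration:

    # Pure-Python SipHash-2-4 sketch; illustrative only, not part of the commit.
    MASK = 0xFFFFFFFFFFFFFFFF  # all arithmetic is modulo 2**64

    def _rotl(x, b):
        return ((x << b) | (x >> (64 - b))) & MASK

    def siphash24(data, key):
        """Hash bytes `data` with a 16-byte `key`, returning a 64-bit int."""
        if len(key) != 16:
            raise ValueError('key must be exactly 16 bytes')
        k0 = int.from_bytes(key[:8], 'little')
        k1 = int.from_bytes(key[8:], 'little')
        # four state words: fixed constants xored with the key halves
        v0 = 0x736f6d6570736575 ^ k0
        v1 = 0x646f72616e646f6d ^ k1
        v2 = 0x6c7967656e657261 ^ k0
        v3 = 0x7465646279746573 ^ k1

        def sipround():
            nonlocal v0, v1, v2, v3
            v0 = (v0 + v1) & MASK; v1 = _rotl(v1, 13); v1 ^= v0; v0 = _rotl(v0, 32)
            v2 = (v2 + v3) & MASK; v3 = _rotl(v3, 16); v3 ^= v2
            v0 = (v0 + v3) & MASK; v3 = _rotl(v3, 21); v3 ^= v0
            v2 = (v2 + v1) & MASK; v1 = _rotl(v1, 17); v1 ^= v2; v2 = _rotl(v2, 32)

        # the final block carries the message length (mod 256) in its top byte
        b = (len(data) & 0xFF) << 56
        n_full = len(data) // 8

        # compression: each full 8-byte little-endian word gets 2 SipRounds
        for i in range(n_full):
            m = int.from_bytes(data[8 * i:8 * i + 8], 'little')
            v3 ^= m
            sipround(); sipround()
            v0 ^= m

        # fold any trailing bytes into the length block, then compress it
        for i, byte in enumerate(data[8 * n_full:]):
            b |= byte << (8 * i)
        v3 ^= b
        sipround(); sipround()
        v0 ^= b

        # finalization: 4 SipRounds, then xor the state words together
        v2 ^= 0xFF
        for _ in range(4):
            sipround()
        return (v0 ^ v1 ^ v2 ^ v3) & MASK

    # example: with a fixed 16-byte key the hash is deterministic
    key = b'0123456789123456'
    assert siphash24(b'pandas', key) == siphash24(b'pandas', key)

The Cython version above performs the same compression and finalization, but reads each element from a pre-encoded C buffer and runs the per-element loop without the GIL.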
