-- never executed always true always false  (stray coverage-tool residue; not part of the module)
    1 {-# LANGUAGE MagicHash     #-}
    2 {-# LANGUAGE UnboxedTuples #-}
    3 -- |
    4 --
    5 -- Fast hashing of Haskell values.
    6 -- The hash used is XXH3 64bit.
    7 --
    8 module Data.BloomFilter.Hash (
    9     -- * Basic hash functionality
   10     Hash,
   11     Hashable(..),
   12     hash64,
   13     hashByteArray,
   14     -- * Incremental hashing
   15     Incremental (..),
   16     HashState,
   17     incrementalHash,
   18     -- * Hashing
   19     Hashes (..),
   20     RealHashes (..),
   21     -- * Compute a family of hash values
   22     CheapHashes (..),
   23     evalCheapHashes,
   24     makeCheapHashes,
   25 ) where
   26 
   27 import           Control.Monad (forM_)
   28 import           Control.Monad.ST (ST, runST)
   29 import           Data.Array.Byte (ByteArray (..))
   30 import           Data.Bits (unsafeShiftR)
   31 import qualified Data.ByteString as BS
   32 import qualified Data.ByteString.Lazy as LBS
   33 import           Data.Char (ord)
   34 import qualified Data.Primitive.ByteArray as P
   35 import           Data.Primitive.Types (Prim (..))
   36 import           Data.Word (Word32, Word64)
   37 import           GHC.Exts (Int#, uncheckedIShiftL#, (+#))
   38 import qualified XXH3
   39 
-- | A hash value is 64 bits wide (the output width of XXH3-64).
type Hash = Word64
   42 
   43 -------------------------------------------------------------------------------
   44 -- One shot hashing
   45 -------------------------------------------------------------------------------
   46 
-- | The class of types that can be converted to a hash value.
--
-- The instances are meant to be stable: the hash values can be persisted.
--
class Hashable a where
    -- | Compute a 64-bit hash of a value, mixing in the given seed.
    hashSalt64 ::
           Word64  -- ^ seed
        -> a       -- ^ value to hash
        -> Word64
   57 
   58 -- | Compute a 64-bit hash.
   59 hash64 :: Hashable a => a -> Word64
   60 hash64 = hashSalt64 0
   61 
   62 instance Hashable () where
   63     hashSalt64 salt _ = salt
   64 
   65 instance Hashable Char where
   66     -- Char's ordinal value should fit into Word32
   67     hashSalt64 salt c = hashSalt64 salt (fromIntegral (ord c) :: Word32)
   68 
   69 instance Hashable BS.ByteString where
   70     hashSalt64 salt bs = XXH3.xxh3_64bit_withSeed_bs bs salt
   71 
   72 instance Hashable LBS.ByteString where
   73     hashSalt64 salt lbs =
   74         incrementalHash salt $ \s ->
   75         forM_ (LBS.toChunks lbs) $ \bs ->
   76         update s bs
   77 
   78 instance Hashable ByteArray where
   79     hashSalt64 salt ba = XXH3.xxh3_64bit_withSeed_ba ba 0 (P.sizeofByteArray ba) salt
   80 
   81 instance Hashable Word64 where
   82     hashSalt64 salt w = XXH3.xxh3_64bit_withSeed_w64 w salt
   83 
   84 instance Hashable Word32 where
   85     hashSalt64 salt w = XXH3.xxh3_64bit_withSeed_w32 w salt
   86 
   87 {- Note [Tree hashing]
   88 
   89 We recursively hash inductive types (instead e.g. just serially hashing
   90 their fields). Why?
   91 
   92 So ("", "x") and ("x", "") or [[],[],[""]], [[],[""],[]] and [[""],[],[]]
   93 have different hash values!
   94 
   95 Another approach would be to have injective serialisation,
   96 but then 'Incremental BS.ByteString' instance (e.g.) would need to serialise
the length, so we'd need a third class for "pieces", keeping 'Incremental'
   98 just adding bytes to the state (without any extras).
   99 
  100 -}
  101 
  102 instance Hashable a => Hashable [a] where
  103     hashSalt64 salt xs = incrementalHash salt $ \s -> forM_ xs $ \x ->
  104         update s (hash64 x)
  105 
  106 instance (Hashable a, Hashable b) => Hashable (a, b) where
  107     hashSalt64 salt (x, y) = incrementalHash salt $ \s -> do
  108         update s (hash64 x)
  109         update s (hash64 y)
  110 
  111 -- | Hash a (part of) 'ByteArray'.
  112 hashByteArray :: ByteArray -> Int -> Int -> Word64 -> Word64
  113 hashByteArray = XXH3.xxh3_64bit_withSeed_ba
  114 
  115 -------------------------------------------------------------------------------
  116 -- Incremental hashing
  117 -------------------------------------------------------------------------------
  118 
-- | Hash state for incremental hashing; wraps a mutable XXH3 state
-- living in the 'ST' monad.
newtype HashState s = HashState (XXH3.XXH3_State s)
  121 
-- | The class of types that can be incrementally hashed.
class Incremental a where
    -- | Feed a value into the mutable hash state.
    update :: HashState s -> a -> ST s ()
  125 
  126 instance Incremental BS.ByteString where
  127     update (HashState s) = XXH3.xxh3_64bit_update_bs s
  128 
  129 instance Incremental Word32 where
  130     update (HashState s) = XXH3.xxh3_64bit_update_w32 s
  131 
  132 instance Incremental Word64 where
  133     update (HashState s) = XXH3.xxh3_64bit_update_w64 s
  134 
  135 instance Incremental Char where
  136     update s c = update s (fromIntegral (ord c) :: Word32)
  137 
  138 -- | Calculate incrementally constructed hash.
  139 incrementalHash :: Word64 -> (forall s. HashState s -> ST s ()) -> Word64
  140 incrementalHash seed f = runST $ do
  141     s <- XXH3.xxh3_64bit_createState
  142     XXH3.xxh3_64bit_reset_withSeed s seed
  143     f (HashState s)
  144     XXH3.xxh3_64bit_digest s
  145 
  146 -------------------------------------------------------------------------------
  147 -- Hashes
  148 -------------------------------------------------------------------------------
  149 
-- | A type class abstracting over different hashing schemes.
class Hashes h where
    -- | Create a family of hash values from a hashable value.
    makeHashes :: Hashable a => a -> h a

    -- | Evaluate the i-th hash of the family.
    evalHashes :: h a -> Int -> Hash
  155 
-- | A closure of a real hashing function: the remaining argument is the
-- seed used for the i-th member of the family.
newtype RealHashes a = RealHashes (Word64 -> Hash)
  158 
  159 instance Hashes RealHashes where
  160     makeHashes x = RealHashes (\salt -> hashSalt64 salt x)
  161     evalHashes (RealHashes f) i = f (fromIntegral i)
  162 
  163 -------------------------------------------------------------------------------
  164 -- CheapHashes
  165 -------------------------------------------------------------------------------
  166 
-- | A pair of hashes used for a double hashing scheme.
--
-- See 'evalCheapHashes'.
data CheapHashes a = CheapHashes !Hash !Hash
  deriving Show
-- The type parameter is a phantom tag; keep it nominal so hashes of
-- different value types cannot be coerced into one another.
type role CheapHashes nominal
  173 
-- Delegates to the standalone functions, which carry the INLINE/SPECIALIZE
-- pragmas and the documentation.
instance Hashes CheapHashes where
    makeHashes = makeCheapHashes
    {-# INLINE makeHashes #-}
    evalHashes = evalCheapHashes
    {-# INLINE evalHashes #-}
  179 
-- Store a 'CheapHashes' as two consecutive 'Word64's, 16 bytes in total.
-- Element @i@ occupies 'Word64' slots @2*i@ ('indexLo') and @2*i+1@
-- ('indexHi'); the inner index/read/write calls below resolve to the
-- 'Word64' 'Prim' instance at those slots.
instance Prim (CheapHashes a) where
    sizeOfType# _ = 16#
    alignmentOfType# _ = 8#

    indexByteArray# ba i = CheapHashes
        (indexByteArray# ba (indexLo i))
        (indexByteArray# ba (indexHi i))
    -- Thread the state token through the two reads explicitly.
    readByteArray# ba i s1 =
        case readByteArray# ba (indexLo i) s1 of { (# s2, lo #) ->
        case readByteArray# ba (indexHi i) s2 of { (# s3, hi #) ->
        (# s3, CheapHashes lo hi #)
        }}
    writeByteArray# ba i (CheapHashes lo hi) s =
        writeByteArray# ba (indexHi i) hi (writeByteArray# ba (indexLo i) lo s)

    -- Same layout for raw addresses ('Addr#').
    indexOffAddr# ba i = CheapHashes
        (indexOffAddr# ba (indexLo i))
        (indexOffAddr# ba (indexHi i))
    readOffAddr# ba i s1 =
        case readOffAddr# ba (indexLo i) s1 of { (# s2, lo #) ->
        case readOffAddr# ba (indexHi i) s2 of { (# s3, hi #) ->
        (# s3, CheapHashes lo hi #)
        }}
    writeOffAddr# ba i (CheapHashes lo hi) s =
        writeOffAddr# ba (indexHi i) hi (writeOffAddr# ba (indexLo i) lo s)
  205 
-- | 'Word64' slot of the low half of element @i@: @2*i@ (shift-left by 1).
indexLo :: Int# -> Int#
indexLo i = uncheckedIShiftL# i 1#
  208 
-- | 'Word64' slot of the high half of element @i@: @2*i + 1@.
indexHi :: Int# -> Int#
indexHi i = uncheckedIShiftL# i 1# +# 1#
  211 
  212 {- Note [Original CheapHashes]
  213 
  214 Compute a list of 32-bit hashes relatively cheaply.  The value to
  215 hash is inspected at most twice, regardless of the number of hashes
  216 requested.
  217 
  218 We use a variant of Kirsch and Mitzenmacher's technique from \"Less
  219 Hashing, Same Performance: Building a Better Bloom Filter\",
  220 <http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf>.
  221 
  222 Where Kirsch and Mitzenmacher multiply the second hash by a
  223 coefficient, we shift right by the coefficient.  This offers better
  224 performance (as a shift is much cheaper than a multiply), and the
  225 low order bits of the final hash stay well mixed.
  226 
  227 -}
  228 
  229 {- Note: [CheapHashes]
  230 
  231 On the first glance the 'evalCheapHashes' scheme seems dubious.
  232 
Firstly, its original performance motivation is dubious.
  234 
  235 > multiply the second hash by a coefficient
  236 
While the double hashing scheme is presented in
  238 theoretical analysis as
  239 
  240     g(i) = a + i * b
  241 
  242 In practice it's implemented in a loop which looks like
  243 
  244     g[0] = a
  245     for (i = 1; i < k; i++) {
  246         a += b;
  247         g[i] = a;
  248     }
  249 
  250 I.e. with just an addition.
  251 
  252 Secondly there is no analysis anywhere about the
  253 'evalCheapHashes' scheme.
  254 
  255 Peter Dillinger's thesis (Adaptive Approximate State Storage)
  256 discusses various fast hashing schemes (section 6.5),
  257 mentioning why ordinary "double hashing" is weak scheme.
  258 
  259 Issue 1: when second hash value is bad, e.g. not coprime with bloom filters size in bits,
  260 we can get repetitions (worst case 0, or m/2).
  261 
  262 Issue 2: in bloom filter scenario, whether we do a + i * b or h0 - i * b' (with b' = -b)
  263 as we probe all indices (as set) doesn't matter, not sequentially (like in hash table).
  264 So we lose one bit entropy.
  265 
  266 Issue 3: the scheme is prone to partial overlap.
  267 Two values with the same second hash value could overlap on many indices.
  268 
  269 Then Dillinger discusses various schemes which solve this issue.
  270 
The CheapHashes scheme seems to avoid these culprits.
  272 This is probably because it uses most of the bits of the second hash, even in m = 2^n scenarios.
(normal double hashing and enhanced double hashing don't use the high bits of the original hash then).
  274 TL;DR CheapHashes seems to work well in practice.
  275 
For the record: RocksDB uses its own scheme as well,
  277 where first hash is used to pick a cache line, and second one to generate probes inside it.
  278 https://github.com/facebook/rocksdb/blob/096fb9b67d19a9a180e7c906b4a0cdb2b2d0c1f6/util/bloom_impl.h
  279 
  280 -}
  281 
  282 -- | Evalute 'CheapHashes' family.
  283 --
  284 -- \[
  285 -- g_i = h_0 + \left\lfloor h_1 / 2^i \right\rfloor
  286 -- \]
  287 --
  288 evalCheapHashes :: CheapHashes a -> Int -> Hash
  289 evalCheapHashes (CheapHashes h1 h2) i = h1 + (h2 `unsafeShiftR` i)
  290 
  291 -- | Create 'CheapHashes' structure.
  292 --
  293 -- It's simply hashes the value twice using seed 0 and 1.
  294 makeCheapHashes :: Hashable a => a -> CheapHashes a
  295 makeCheapHashes v = CheapHashes (hashSalt64 0 v) (hashSalt64 1 v)
  296 {-# SPECIALIZE makeCheapHashes :: BS.ByteString -> CheapHashes BS.ByteString #-}
  297 {-# INLINEABLE makeCheapHashes #-}