{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
-- |
--
-- Fast hashing of Haskell values.
-- The hash used is XXH3 64bit.
--
module Data.BloomFilter.Hash (
    -- * Basic hash functionality
    Hash,
    Hashable(..),
    hash64,
    hashByteArray,
    -- * Incremental hashing
    Incremental (..),
    HashState,
    incrementalHash,
    -- * Hashing
    Hashes (..),
    RealHashes (..),
    -- * Compute a family of hash values
    CheapHashes (..),
    evalCheapHashes,
    makeCheapHashes,
) where

import Control.Monad (forM_)
import Control.Monad.ST (ST, runST)
import Data.Array.Byte (ByteArray (..))
import Data.Bits (unsafeShiftR)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.Char (ord)
import qualified Data.Primitive.ByteArray as P
import Data.Primitive.Types (Prim (..))
import Data.Word (Word32, Word64)
import GHC.Exts (Int#, uncheckedIShiftL#, (+#))
import qualified XXH3

-- | A hash value is 64 bits wide.
type Hash = Word64

-------------------------------------------------------------------------------
-- One shot hashing
-------------------------------------------------------------------------------

-- | The class of types that can be converted to a hash value.
--
-- The instances are meant to be stable: the hash values can be persisted.
--
class Hashable a where
    -- | Compute a 64-bit hash of a value.
    hashSalt64 ::
           Word64  -- ^ seed
        -> a       -- ^ value to hash
        -> Word64

-- | Compute a 64-bit hash.
hash64 :: Hashable a => a -> Word64
hash64 = hashSalt64 0
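
-- A usage sketch: 'hash64' is 'hashSalt64' with seed 0, so for example
--
-- > hash64 ("key" :: BS.ByteString) == hashSalt64 0 ("key" :: BS.ByteString)
--
-- holds by definition, while distinct seeds give different, well-mixed hash
-- values for the same input.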

instance Hashable () where
    hashSalt64 salt _ = salt

instance Hashable Char where
    -- Char's ordinal value always fits into a Word32 (maxBound :: Char is 0x10FFFF)
    hashSalt64 salt c = hashSalt64 salt (fromIntegral (ord c) :: Word32)

instance Hashable BS.ByteString where
    hashSalt64 salt bs = XXH3.xxh3_64bit_withSeed_bs bs salt

instance Hashable LBS.ByteString where
    hashSalt64 salt lbs =
        incrementalHash salt $ \s ->
            forM_ (LBS.toChunks lbs) $ \bs ->
                update s bs

instance Hashable ByteArray where
    hashSalt64 salt ba = XXH3.xxh3_64bit_withSeed_ba ba 0 (P.sizeofByteArray ba) salt

instance Hashable Word64 where
    hashSalt64 salt w = XXH3.xxh3_64bit_withSeed_w64 w salt

instance Hashable Word32 where
    hashSalt64 salt w = XXH3.xxh3_64bit_withSeed_w32 w salt

{- Note [Tree hashing]

We hash inductive types recursively (instead of, e.g., just serially hashing
their fields). Why?

So that e.g. ("", "x") and ("x", ""), or [[],[],[""]], [[],[""],[]] and
[[""],[],[]], get different hash values!

Another approach would be an injective serialisation, but then e.g. the
'Incremental BS.ByteString' instance would need to serialise the length as
well, so we'd need a third class for "pieces", keeping 'Incremental' to just
adding bytes to the state (without any extras).

-}
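
-- To illustrate the pitfall described in the note above (a sketch;
-- 'serialHash' is hypothetical and deliberately wrong):
--
-- > serialHash (a, b) = hash64 (a <> b)          -- serially hash the fields
-- > serialHash ("", "x") == serialHash ("x", "") -- collision!
--
-- The instances below instead hash each component to a fixed-width 'Word64'
-- first, so these two pairs hash differently.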

instance Hashable a => Hashable [a] where
    hashSalt64 salt xs = incrementalHash salt $ \s -> forM_ xs $ \x ->
        update s (hash64 x)

instance (Hashable a, Hashable b) => Hashable (a, b) where
    hashSalt64 salt (x, y) = incrementalHash salt $ \s -> do
        update s (hash64 x)
        update s (hash64 y)
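
-- A user-defined type can get a 'Hashable' instance following the same
-- pattern (a hypothetical sketch; 'Person' is not part of this module):
--
-- > data Person = Person { name :: BS.ByteString, age :: Word32 }
-- >
-- > instance Hashable Person where
-- >     hashSalt64 salt (Person n a) = incrementalHash salt $ \s -> do
-- >         update s (hash64 n)
-- >         update s (hash64 a)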

-- | Hash a slice of a 'ByteArray': the 'Int' arguments are the offset and
-- length (in bytes) of the slice, and the 'Word64' is the seed.
hashByteArray :: ByteArray -> Int -> Int -> Word64 -> Word64
hashByteArray = XXH3.xxh3_64bit_withSeed_ba

-------------------------------------------------------------------------------
-- Incremental hashing
-------------------------------------------------------------------------------

-- | Hash state for incremental hashing
newtype HashState s = HashState (XXH3.XXH3_State s)

-- | The class of types that can be incrementally hashed.
class Incremental a where
    update :: HashState s -> a -> ST s ()

instance Incremental BS.ByteString where
    update (HashState s) = XXH3.xxh3_64bit_update_bs s

instance Incremental Word32 where
    update (HashState s) = XXH3.xxh3_64bit_update_w32 s

instance Incremental Word64 where
    update (HashState s) = XXH3.xxh3_64bit_update_w64 s

instance Incremental Char where
    update s c = update s (fromIntegral (ord c) :: Word32)

-- | Calculate an incrementally constructed hash.
incrementalHash :: Word64 -> (forall s. HashState s -> ST s ()) -> Word64
incrementalHash seed f = runST $ do
    s <- XXH3.xxh3_64bit_createState
    XXH3.xxh3_64bit_reset_withSeed s seed
    f (HashState s)
    XXH3.xxh3_64bit_digest s
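
-- For example, hashing a list of chunks with a single state (a sketch
-- mirroring the 'LBS.ByteString' instance above; 'hashChunks' is
-- hypothetical):
--
-- > hashChunks :: [BS.ByteString] -> Word64
-- > hashChunks chunks = incrementalHash 0 $ \s -> forM_ chunks (update s)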

-------------------------------------------------------------------------------
-- Hashes
-------------------------------------------------------------------------------

-- | A type class abstracting over different hashing schemes.
class Hashes h where
    makeHashes :: Hashable a => a -> h a

    evalHashes :: h a -> Int -> Hash

-- | A closure over a real hash function.
newtype RealHashes a = RealHashes (Word64 -> Hash)

instance Hashes RealHashes where
    makeHashes x = RealHashes (\salt -> hashSalt64 salt x)
    evalHashes (RealHashes f) i = f (fromIntegral i)
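
-- A Bloom filter with @k@ probes and @m@ bits could derive its probe indices
-- from any 'Hashes' instance like so (a hypothetical sketch, not part of this
-- module's API):
--
-- > probeIndices :: Hashes h => Int -> Int -> h a -> [Int]
-- > probeIndices k m hs =
-- >     [ fromIntegral (evalHashes hs i) `mod` m | i <- [0 .. k - 1] ]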

-------------------------------------------------------------------------------
-- CheapHashes
-------------------------------------------------------------------------------

-- | A pair of hashes used for a double hashing scheme.
--
-- See 'evalCheapHashes'.
data CheapHashes a = CheapHashes !Hash !Hash
  deriving Show
type role CheapHashes nominal

instance Hashes CheapHashes where
    makeHashes = makeCheapHashes
    {-# INLINE makeHashes #-}
    evalHashes = evalCheapHashes
    {-# INLINE evalHashes #-}

instance Prim (CheapHashes a) where
    sizeOfType# _ = 16#
    alignmentOfType# _ = 8#

    indexByteArray# ba i = CheapHashes
        (indexByteArray# ba (indexLo i))
        (indexByteArray# ba (indexHi i))
    readByteArray# ba i s1 =
        case readByteArray# ba (indexLo i) s1 of { (# s2, lo #) ->
        case readByteArray# ba (indexHi i) s2 of { (# s3, hi #) ->
            (# s3, CheapHashes lo hi #)
        }}
    writeByteArray# ba i (CheapHashes lo hi) s =
        writeByteArray# ba (indexHi i) hi (writeByteArray# ba (indexLo i) lo s)

    indexOffAddr# addr i = CheapHashes
        (indexOffAddr# addr (indexLo i))
        (indexOffAddr# addr (indexHi i))
    readOffAddr# addr i s1 =
        case readOffAddr# addr (indexLo i) s1 of { (# s2, lo #) ->
        case readOffAddr# addr (indexHi i) s2 of { (# s3, hi #) ->
            (# s3, CheapHashes lo hi #)
        }}
    writeOffAddr# addr i (CheapHashes lo hi) s =
        writeOffAddr# addr (indexHi i) hi (writeOffAddr# addr (indexLo i) lo s)

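-- Array-element index arithmetic for the 'Prim' instance above: a
-- 'CheapHashes' value occupies two consecutive 64-bit words, so element @i@
-- stores its two hashes at word offsets @2 * i@ and @2 * i + 1@: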
indexLo :: Int# -> Int#
indexLo i = uncheckedIShiftL# i 1#

indexHi :: Int# -> Int#
indexHi i = uncheckedIShiftL# i 1# +# 1#

{- Note [Original CheapHashes]

Compute a list of 32-bit hashes relatively cheaply. The value to
hash is inspected at most twice, regardless of the number of hashes
requested.

We use a variant of Kirsch and Mitzenmacher's technique from \"Less
Hashing, Same Performance: Building a Better Bloom Filter\",
<http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf>.

Where Kirsch and Mitzenmacher multiply the second hash by a
coefficient, we shift right by the coefficient. This offers better
performance (as a shift is much cheaper than a multiply), and the
low order bits of the final hash stay well mixed.

-}

{- Note: [CheapHashes]

At first glance the 'evalCheapHashes' scheme seems dubious.

Firstly, its original performance motivation is questionable:

> multiply the second hash by a coefficient

While the double hashing scheme is presented in theoretical analyses as

    g(i) = a + i * b

in practice it is implemented in a loop which looks like

    g[0] = a
    for (i = 1; i < k; i++) {
        a += b;
        g[i] = a;
    }

i.e. with just an addition.

Secondly, there is no analysis anywhere of the 'evalCheapHashes' scheme
itself.

Peter Dillinger's thesis (Adaptive Approximate State Storage) discusses
various fast hashing schemes (section 6.5), explaining why ordinary
"double hashing" is a weak scheme.

Issue 1: when the second hash value is bad, e.g. not coprime with the bloom
filter's size in bits, we can get repetitions (in the worst cases b = 0 or
b = m/2).

Issue 2: in the bloom filter scenario we probe all indices as a set, not
sequentially (as in a hash table), so it doesn't matter whether we compute
a + i * b or a - i * b (i.e. use b' = -b). We therefore lose one bit of
entropy.

Issue 3: the scheme is prone to partial overlap: two values with the same
second hash value can overlap on many indices.

Dillinger then discusses various schemes which solve these issues.

The CheapHashes scheme seems to avoid these culprits. This is probably
because it uses most of the bits of the second hash, even when m = 2^n
(normal double hashing and enhanced double hashing don't use the high bits
of the original hash in that case). TL;DR: CheapHashes seems to work well
in practice.

For the record: RocksDB uses a scheme of its own, where the first hash picks
a cache line and the second one generates the probes inside it.
https://github.com/facebook/rocksdb/blob/096fb9b67d19a9a180e7c906b4a0cdb2b2d0c1f6/util/bloom_impl.h

-}

-- | Evaluate the 'CheapHashes' family.
--
-- \[
-- g_i = h_1 + \left\lfloor h_2 / 2^i \right\rfloor
-- \]
--
-- Note: the index @i@ must be in @[0, 63]@, as 'unsafeShiftR' is undefined
-- for larger shifts of a 'Word64'.
evalCheapHashes :: CheapHashes a -> Int -> Hash
evalCheapHashes (CheapHashes h1 h2) i = h1 + (h2 `unsafeShiftR` i)
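
-- For example (a sketch): to obtain @k@ hash values for one key, build the
-- family once with 'makeCheapHashes' below and evaluate it at @0 .. k-1@:
--
-- > map (evalCheapHashes (makeCheapHashes key)) [0 .. k - 1]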

-- | Create a 'CheapHashes' structure.
--
-- It simply hashes the value twice, using seeds 0 and 1.
makeCheapHashes :: Hashable a => a -> CheapHashes a
makeCheapHashes v = CheapHashes (hashSalt64 0 v) (hashSalt64 1 v)
{-# SPECIALIZE makeCheapHashes :: BS.ByteString -> CheapHashes BS.ByteString #-}
{-# INLINEABLE makeCheapHashes #-}