never executed always true always false
1 -- |
2 --
3 -- A fast, space efficient Bloom filter implementation. A Bloom
4 -- filter is a set-like data structure that provides a probabilistic
5 -- membership test.
6 --
7 -- * Queries do not give false negatives. When an element is added to
8 -- a filter, a subsequent membership test will definitely return
9 -- 'True'.
10 --
11 -- * False positives /are/ possible. If an element has not been added
12 -- to a filter, a membership test /may/ nevertheless indicate that
13 -- the element is present.
14 --
15 -- This module provides low-level control. For an easier to use
16 -- interface, see the "Data.BloomFilter.Easy" module.
17
18 module Data.BloomFilter (
19 -- * Overview
20 -- $overview
21
22 -- ** Ease of use
23 -- $ease
24
25 -- ** Performance
26 -- $performance
27
28 -- ** Differences from bloomfilter package
29 -- $differences
30
31 -- * Types
32 Hash,
33 Bloom,
34 MBloom,
35 Bloom',
36 MBloom',
37 CheapHashes,
38 RealHashes,
39
40 -- * Immutable Bloom filters
41
42 -- ** Conversion
43 freeze,
44 thaw,
45 unsafeFreeze,
46
47 -- ** Creation
48 unfold,
49
50 fromList,
51 empty,
52 singleton,
53
54 -- ** Accessors
55 length,
56 elem,
57 elemHashes,
58 notElem,
59 ) where
60
61 import Control.Exception (assert)
62 import Control.Monad (forM_, liftM)
63 import Control.Monad.ST (ST, runST)
64 import Data.BloomFilter.Hash (CheapHashes, Hash, Hashable,
65 Hashes (..), RealHashes)
66 import Data.BloomFilter.Internal (Bloom' (..), bloomInvariant)
67 import Data.BloomFilter.Mutable (MBloom, MBloom', insert, new)
68 import qualified Data.BloomFilter.Mutable.Internal as MB
69 import Data.Word (Word64)
70
71 import Prelude hiding (elem, length, notElem)
72
73 import qualified Data.BloomFilter.BitVec64 as V
74
75 -- | Bloom filter using 'CheapHashes' hashing scheme.
76 type Bloom = Bloom' CheapHashes
77
78 -- | Create an immutable Bloom filter, using the given setup function
79 -- which executes in the 'ST' monad.
80 --
81 -- Example:
82 --
83 -- @
84 -- TODO
85 --import "Data.BloomFilter.Hash" (cheapHashes)
86 --
87 --filter = create (cheapHashes 3) 1024 $ \mf -> do
88 -- insertMB mf \"foo\"
89 -- insertMB mf \"bar\"
90 -- @
91 --
92 -- Note that the result of the setup function is not used.
93 create :: Int -- ^ number of hash functions to use
94 -> Word64 -- ^ number of bits in filter
95 -> (forall s. (MBloom' s h a -> ST s ())) -- ^ setup function
96 -> Bloom' h a
97 {-# INLINE create #-}
98 create hash numBits body = runST $ do
99 mb <- new hash numBits
100 body mb
101 unsafeFreeze mb
102
103 -- | Create an immutable Bloom filter from a mutable one. The mutable
104 -- filter may be modified afterwards.
105 freeze :: MBloom' s h a -> ST s (Bloom' h a)
106 freeze mb = do
107 ba <- V.freeze (MB.bitArray mb)
108 let !bf = Bloom (MB.hashesN mb) (MB.size mb) ba
109 assert (bloomInvariant bf) $ pure bf
110
111 -- | Create an immutable Bloom filter from a mutable one. The mutable
112 -- filter /must not/ be modified afterwards, or a runtime crash may
113 -- occur. For a safer creation interface, use 'freeze' or 'create'.
114 unsafeFreeze :: MBloom' s h a -> ST s (Bloom' h a)
115 unsafeFreeze mb = do
116 ba <- V.unsafeFreeze (MB.bitArray mb)
117 let !bf = Bloom (MB.hashesN mb) (MB.size mb) ba
118 assert (bloomInvariant bf) $ pure bf
119
120 -- | Copy an immutable Bloom filter to create a mutable one. There is
121 -- no non-copying equivalent.
122 thaw :: Bloom' h a -> ST s (MBloom' s h a)
123 thaw ub = MB.MBloom (hashesN ub) (size ub) `liftM` V.thaw (bitArray ub)
124
125 -- | Create an empty Bloom filter.
126 empty :: Int -- ^ number of hash functions to use
127 -> Word64 -- ^ number of bits in filter
128 -> Bloom' h a
129 {-# INLINE [1] empty #-}
130 empty hash numBits = create hash numBits (\_ -> return ())
131
132 -- | Create a Bloom filter with a single element.
133 singleton :: (Hashes h, Hashable a)
134 => Int -- ^ number of hash functions to use
135 -> Word64 -- ^ number of bits in filter
136 -> a -- ^ element to insert
137 -> Bloom' h a
138 singleton hash numBits elt = create hash numBits (\mb -> insert mb elt)
139
140 -- | Query an immutable Bloom filter for membership. If the value is
141 -- present, return @True@. If the value is not present, there is
142 -- /still/ some possibility that @True@ will be returned.
143 elem :: (Hashes h, Hashable a) => a -> Bloom' h a -> Bool
144 elem elt ub = elemHashes (makeHashes elt) ub
145 {-# SPECIALIZE elem :: Hashable a => a -> Bloom a -> Bool #-}
146
147 -- | Query an immutable Bloom filter for membership using already constructed 'Hashes' value.
148 elemHashes :: Hashes h => h a -> Bloom' h a -> Bool
149 elemHashes !ch !ub = go 0 where
150 go :: Int -> Bool
151 go !i | i >= hashesN ub
152 = True
153 go !i = let idx' :: Word64
154 !idx' = evalHashes ch i in
155 let idx :: Int
156 !idx = fromIntegral (idx' `V.unsafeRemWord64` size ub) in
157 -- While the idx' can cover the full Word64 range,
158 -- after taking the remainder, it now must fit in
159 -- and Int because it's less than the filter size.
160 if V.unsafeIndex (bitArray ub) idx
161 then go (i + 1)
162 else False
163 {-# SPECIALIZE elemHashes :: CheapHashes a -> Bloom a -> Bool #-}
164
165 -- | Query an immutable Bloom filter for non-membership. If the value
166 -- /is/ present, return @False@. If the value is not present, there
167 -- is /still/ some possibility that @False@ will be returned.
168 notElem :: (Hashes h, Hashable a) => a -> Bloom' h a -> Bool
169 notElem elt ub = notElemHashes (makeHashes elt) ub
170
171 -- | Query an immutable Bloom filter for non-membership using already constructed 'Hashes' value.
172 notElemHashes :: Hashes h => h a -> Bloom' h a -> Bool
173 notElemHashes !ch !ub = not (elemHashes ch ub)
174
175 -- | Return the size of an immutable Bloom filter, in bits.
176 length :: Bloom' h a -> Word64
177 length = size
178
179 -- | Build an immutable Bloom filter from a seed value. The seeding
180 -- function populates the filter as follows.
181 --
182 -- * If it returns 'Nothing', it is finished producing values to
183 -- insert into the filter.
184 --
185 -- * If it returns @'Just' (a,b)@, @a@ is added to the filter and
186 -- @b@ is used as a new seed.
187 unfold :: forall a b h. (Hashes h, Hashable a)
188 => Int -- ^ number of hash functions to use
189 -> Word64 -- ^ number of bits in filter
190 -> (b -> Maybe (a, b)) -- ^ seeding function
191 -> b -- ^ initial seed
192 -> Bloom' h a
193 {-# INLINE unfold #-}
194 unfold hs numBits f k = create hs numBits (loop k)
195 where loop :: forall s. b -> MBloom' s h a -> ST s ()
196 loop j mb = case f j of
197 Just (a, j') -> insert mb a >> loop j' mb
198 _ -> return ()
199
200 -- | Create an immutable Bloom filter, populating it from a list of
201 -- values.
202 --
203 -- Here is an example that uses the @cheapHashes@ function from the
204 -- "Data.BloomFilter.Hash" module to create a hash function that
205 -- returns three hashes.
206 --
207 -- @
208 -- filt = fromList 3 1024 [\"foo\", \"bar\", \"quux\"]
209 -- @
210 fromList :: (Hashes h, Hashable a)
211 => Int -- ^ number of hash functions to use
212 -> Word64 -- ^ number of bits in filter
213 -> [a] -- ^ values to populate with
214 -> Bloom' h a
215 fromList hs numBits list = create hs numBits $ forM_ list . insert
216
217 -- $overview
218 --
219 -- Each of the functions for creating Bloom filters accepts two parameters:
220 --
221 -- * The number of bits that should be used for the filter. Note that
222 -- a filter is fixed in size; it cannot be resized after creation.
223 --
224 -- * A number of hash functions, /k/, to be used for the filter.
225 --
226 -- By choosing these parameters with care, it is possible to tune for
227 -- a particular false positive rate.
228 -- The 'Data.BloomFilter.Easy.suggestSizing' function in
229 -- the "Data.BloomFilter.Easy" module calculates useful estimates for
230 -- these parameters.
231
232 -- $ease
233 --
234 -- This module provides immutable interfaces for working with a
235 -- query-only Bloom filter, and for converting to and from mutable
236 -- Bloom filters.
237 --
238 -- For a higher-level interface that is easy to use, see the
239 -- "Data.BloomFilter.Easy" module.
240
241 -- $performance
242 --
243 -- The implementation has been carefully tuned for high performance
244 -- and low space consumption.
245
246 -- $differences
247 --
248 -- This package is (almost entirely rewritten) fork of
249 -- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
250 --
251 -- The main differences are
252 --
253 -- * This packages support bloomfilters of arbitrary sizes
254 -- (not limited to powers of two). Also sizes over 2^32 are supported.
255 --
256 -- * The 'Bloom' and 'MBloom' types are parametrised over 'Hashes' variable,
257 -- instead of having a @a -> ['Hash']@ typed field.
258 -- This separation allows clean de/serialization of Bloom filters in this
259 -- package, as the hashing scheme is a static.
260 --
261 -- * [XXH3 hash](https://xxhash.com/) is used instead of Jenkins'
262 -- lookup3.