never executed always true always false
1 -- | An easy-to-use Bloom filter interface.
2 module Data.BloomFilter.Easy (
3 -- * Easy creation and querying
4 Bloom,
5 easyList,
6 B.elem,
7 B.notElem,
8 B.length,
9
10 -- * Mutable bloom filter
11 MBloom,
12 easyNew,
13 MB.new,
14 MB.insert,
15 B.freeze,
16
17 -- ** Example: a spell checker
18 -- $example
19
20 -- * Useful defaults for creation
21 safeSuggestSizing,
22 suggestSizing,
23 ) where
24
25 import Control.Monad.ST (ST)
26 import Data.BloomFilter (Bloom)
27 import qualified Data.BloomFilter as B
28 import Data.BloomFilter.Calc
29 import Data.BloomFilter.Hash (Hashable)
30 import Data.BloomFilter.Mutable (MBloom)
31 import qualified Data.BloomFilter.Mutable as MB
32 import qualified Data.ByteString as SB
33 import Data.Word (Word64)
34
35 -------------------------------------------------------------------------------
36 -- Easy interface
37 -------------------------------------------------------------------------------
38
-- | Build an immutable Bloom filter from a list of members, sized for
-- the requested false positive rate.
--
-- The length of the list is used as the capacity and handed to
-- 'suggestSizing' to obtain the number of bits and the number of hash
-- functions. An empty list bypasses 'suggestSizing' altogether and
-- produces a minimal filter with one bit and one hash function.
--
-- Hashing of the elements is left to 'B.fromList' (presumably the
-- @cheapHashes@ scheme of "Data.BloomFilter.Hash" -- confirm there).
easyList :: Hashable a
         => Double   -- ^ desired false positive rate (0 < /ε/ < 1)
         -> [a]      -- ^ values to populate with
         -> Bloom a
{-# SPECIALIZE easyList :: Double -> [SB.ByteString] -> Bloom SB.ByteString #-}
easyList errRate xs = B.fromList hashCount bitCount xs
  where
    -- 'suggestSizing' yields (bits, hashes); 'B.fromList' wants the
    -- hash count first.
    (bitCount, hashCount) = case length xs of
        n | n > 0     -> suggestSizing n errRate
          | otherwise -> (1, 1)
53
-- | Allocate a mutable Bloom filter sized for the desired false
-- positive rate, /ε/, and the expected maximum number of elements, /n/.
--
-- Both arguments are passed straight to 'suggestSizing', so whatever
-- that function rejects (by calling 'error') is rejected here as well.
easyNew :: Double    -- ^ desired false positive rate (0 < /ε/ < 1)
        -> Int       -- ^ expected maximum size, /n/
        -> ST s (MBloom s a)
easyNew errRate capacity =
    -- 'suggestSizing' yields (bits, hashes); 'MB.new' wants the hash
    -- count first.
    let (bitCount, hashCount) = suggestSizing capacity errRate
    in  MB.new hashCount bitCount
62
63 -------------------------------------------------------------------------------
64 -- Size suggestions
65 -------------------------------------------------------------------------------
66
-- | Suggest a good combination of filter size and number of hash
-- functions for a Bloom filter, based on its expected maximum
-- capacity and a desired false positive rate.
--
-- The false positive rate is the rate at which queries against the
-- filter should return 'True' when an element is not actually
-- present. It should be a fraction between 0 and 1, so a 1% false
-- positive rate is represented by 0.01.
--
-- This function will suggest to use a bloom filter of prime size.
-- These theoretically behave the best.
-- Also it won't suggest to use over 63 hash functions,
-- because CheapHashes work only up to 63 functions.
--
-- Note that while creating bloom filters with extremely small (or
-- even negative) capacity is allowed for convenience, it is often
-- not very useful.
-- This function will always suggest to use at least 61 bits.
--
-- Results:
--
-- * A capacity of zero or less yields @Right (61, 1)@. This check runs
--   first, so the error rate is not validated in that case.
--
-- * An error rate that is not strictly between 0 and 1 (including NaN)
--   yields @Left "invalid error rate"@.
--
-- * If no entry of the prime table is large enough for the required
--   number of bits, the result is @Left "capacity too large"@.
--
-- >>> safeSuggestSizing 10000 0.01
-- Right (99317,7)
--
safeSuggestSizing ::
       Int      -- ^ expected maximum capacity
    -> Double   -- ^ desired false positive rate (0 < /e/ < 1)
    -> Either String (Word64, Int)
safeSuggestSizing capacity errRate
    | capacity <= 0                    = Right (61, 1)
    -- Written as a negated range test so that NaN, for which every
    -- comparison is False, is rejected here instead of slipping
    -- through and being reported as "capacity too large".
    | not (errRate > 0 && errRate < 1) = Left "invalid error rate"
    | otherwise =
        -- 'primes' is ascending, so the first entry that fits is the
        -- smallest suitable size.
        case filter ((>= bits) . fromIntegral) primes of
            (w:_) -> Right (w, hashes)
            []    -> Left "capacity too large"
  where
    -- Smallest bit requirement over every permitted hash count (1..63),
    -- paired with the hash count that achieves it. Ties on the bit
    -- count resolve to the lower hash count via tuple ordering.
    bits :: Double
    hashes :: Int
    (bits, hashes) = minimum
        [ (filterSize (fromIntegral capacity) errRate (fromIntegral k), k)
        | k <- [1 .. 63]
        ]
110
-- Candidate filter sizes in ascending order: primes running from about
-- 2^6 up to about 2^40, five per "octave" (doubling). Each row of the
-- table below holds one octave:
--
-- * 61, 73, 83, 97, 109
-- * 127, 139, ...
-- * 257, 293, ...
-- * ...
--
-- Moving three entries ahead multiplies the size by roughly 1.5:
-- 97/61 = 1.59; 109/73 = 1.49; 127/83 = 1.53
--
-- Neighbouring entries grow by a factor of roughly 1.15
-- (2^(1/5) is about 1.149).
--
primes :: [Word64]
primes =
    [ 61, 73, 83, 97, 109
    , 127, 139, 167, 193, 223
    , 257, 293, 337, 389, 443
    , 509, 587, 673, 773, 887
    , 1021, 1171, 1327, 1553, 1783
    , 2039, 2351, 2699, 3089, 3559
    , 4093, 4703, 5399, 6203, 7129
    , 8191, 9403, 10799, 12413, 14251
    , 16381, 18803, 21617, 24821, 28517
    , 32749, 37633, 43237, 49667, 57047
    , 65537, 75277, 86467, 99317, 114089
    , 131071, 150559, 172933, 198659, 228203
    , 262139, 301123, 345889, 397337, 456409
    , 524287, 602233, 691799, 794669, 912839
    , 1048573, 1204493, 1383593, 1589333, 1825673
    , 2097143, 2408993, 2767201, 3178667, 3651341
    , 4194301, 4817977, 5534413, 6357353, 7302683
    , 8388593, 9635981, 11068817, 12714749, 14605411
    , 16777213, 19271957, 22137667, 25429499, 29210821
    , 33554393, 38543917, 44275331, 50858999, 58421653
    , 67108859, 77087833, 88550677, 101718013, 116843297
    , 134217689, 154175663, 177101321, 203436029, 233686637
    , 268435399, 308351357, 354202703, 406872031, 467373223
    , 536870909, 616702721, 708405407, 813744131, 934746541
    , 1073741789, 1233405449, 1416810797, 1627488229, 1869493097
    , 2147483647, 2466810893, 2833621657, 3254976541, 3738986131
    , 4294967291, 4933621843, 5667243317, 6509953069, 7477972391
    , 8589934583, 9867243719, 11334486629, 13019906153, 14955944737
    , 17179869143, 19734487471, 22668973277, 26039812297, 29911889569
    , 34359738337, 39468974939, 45337946581, 52079624657, 59823779149
    , 68719476731, 78937949837, 90675893137, 104159249321, 119647558343
    , 137438953447, 157875899707, 181351786333, 208318498651, 239295116717
    , 274877906899, 315751799521, 362703572681, 416636997289, 478590233419
    , 549755813881, 631503599063, 725407145383, 833273994643, 957180466901
    , 1099511627689
    ]
147
-- | Behaves as 'safeSuggestSizing', but calls 'error' if given
-- invalid or out-of-range inputs.
--
-- The error message is the 'Left' text from 'safeSuggestSizing',
-- prefixed with this function's qualified name.
suggestSizing :: Int      -- ^ expected maximum capacity
              -> Double   -- ^ desired false positive rate (0 < /e/ < 1)
              -> (Word64, Int)
suggestSizing cap errs =
    case safeSuggestSizing cap errs of
        Right sizing -> sizing
        -- The prefix names the module this function actually lives in.
        Left problem -> error ("Data.BloomFilter.Easy.suggestSizing: " ++ problem)
155
156 -- $example
157 --
158 -- This example reads a dictionary file containing one word per line,
159 -- constructs a Bloom filter with a 1% false positive rate, and
160 -- spellchecks its standard input. Like the Unix @spell@ command, it
161 -- prints each word that it does not recognize.
162 --
163 -- @
164 -- import Data.Maybe (mapMaybe)
165 -- import qualified Data.BloomFilter.Easy as B
166 --
167 -- main = do
168 -- filt \<- B.'easyList' 0.01 . words \<$> readFile "\/usr\/share\/dict\/words"
169 -- let check word | B.'B.elem' word filt = Nothing
170 -- | otherwise = Just word
171 -- interact (unlines . mapMaybe check . lines)
172 -- @