never executed always true always false
    1 -- | An easy-to-use Bloom filter interface.
    2 module Data.BloomFilter.Easy (
    3     -- * Easy creation and querying
    4     Bloom,
    5     easyList,
    6     B.elem,
    7     B.notElem,
    8     B.length,
    9 
   10     -- * Mutable bloom filter
   11     MBloom,
   12     easyNew,
   13     MB.new,
   14     MB.insert,
   15     B.freeze,
   16 
   17     -- ** Example: a spell checker
   18     -- $example
   19 
   20     -- * Useful defaults for creation
   21     safeSuggestSizing,
   22     suggestSizing,
   23 ) where
   24 
   25 import           Control.Monad.ST (ST)
   26 import           Data.BloomFilter (Bloom)
   27 import qualified Data.BloomFilter as B
   28 import           Data.BloomFilter.Calc
   29 import           Data.BloomFilter.Hash (Hashable)
   30 import           Data.BloomFilter.Mutable (MBloom)
   31 import qualified Data.BloomFilter.Mutable as MB
   32 import qualified Data.ByteString as SB
   33 import           Data.Word (Word64)
   34 
   35 -------------------------------------------------------------------------------
   36 -- Easy interface
   37 -------------------------------------------------------------------------------
   38 
   39 -- | Create a Bloom filter with the desired false positive rate and
   40 -- members.  The hash functions used are computed by the @cheapHashes@
   41 -- function from the 'Data.BloomFilter.Hash' module.
   42 easyList :: Hashable a
   43          => Double              -- ^ desired false positive rate (0 < /ε/ < 1)
   44          -> [a]                 -- ^ values to populate with
   45          -> Bloom a
   46 {-# SPECIALIZE easyList :: Double -> [SB.ByteString] -> Bloom SB.ByteString #-}
   47 easyList errRate xs = B.fromList numHashes numBits xs
   48   where
   49     capacity = length xs
   50     (numBits, numHashes)
   51         | capacity > 0 = suggestSizing capacity errRate
   52         | otherwise    = (1, 1)
   53 
   54 -- | Create a Bloom filter with the desired false positive rate, /ε/
   55 -- and expected maximum size, /n/.
   56 easyNew :: Double    -- ^ desired false positive rate (0 < /ε/ < 1)
   57         -> Int       -- ^ expected maximum size, /n/
   58         -> ST s (MBloom s a)
   59 easyNew errRate capacity = MB.new numHashes numBits
   60   where
   61     (numBits, numHashes) = suggestSizing capacity errRate
   62 
   63 -------------------------------------------------------------------------------
   64 -- Size suggestions
   65 -------------------------------------------------------------------------------
   66 
   67 -- | Suggest a good combination of filter size and number of hash
   68 -- functions for a Bloom filter, based on its expected maximum
   69 -- capacity and a desired false positive rate.
   70 --
   71 -- The false positive rate is the rate at which queries against the
   72 -- filter should return 'True' when an element is not actually
   73 -- present.  It should be a fraction between 0 and 1, so a 1% false
   74 -- positive rate is represented by 0.01.
   75 --
   76 -- This function will suggest to use a bloom filter of prime size.
   77 -- These theoretically behave the best.
   78 -- Also it won't suggest to use over 63 hash functions,
   79 -- because CheapHashes work only up to 63 functions.
   80 --
   81 -- Note that while creating bloom filters with extremely small (or
   82 -- even negative) capacity is allowed for convenience, it is often
   83 -- not very useful.
   84 -- This function will always suggest to use at least 61 bits.
   85 --
   86 -- >>> safeSuggestSizing 10000 0.01
   87 -- Right (99317,7)
   88 --
   89 safeSuggestSizing ::
   90        Int              -- ^ expected maximum capacity
   91     -> Double           -- ^ desired false positive rate (0 < /e/ < 1)
   92     -> Either String (Word64, Int)
   93 safeSuggestSizing (fromIntegral -> capacity) errRate
   94     | capacity <= 0                = Right (61, 1)
   95     | errRate <= 0 || errRate >= 1 = Left "invalid error rate"
   96     | otherwise                    = pickSize primes
   97   where
   98     bits   :: Double
   99     hashes :: Int
  100     (bits, hashes) = minimum
  101         [ (filterSize capacity errRate k, k')
  102         | k' <- [1 .. 63]
  103         , let k = fromIntegral k'
  104         ]
  105 
  106     pickSize [] = Left "capacity too large"
  107     pickSize (w:ws)
  108         | fromIntegral w >= bits = Right (w, hashes)
  109         | otherwise              = pickSize ws
  110 
  111 -- primes from around 2^6 to 2^40, with five primes per "octave",
  112 --
  113 -- * 61, 73, 83, 97, 109
  114 -- * 127, 139, ...
  115 -- * 257, 293, ...
  116 -- * ...
  117 --
  118 -- The third next element is around 1.5 times larger:
  119 -- 97/63 = 1.59; 109/73 = 1.49; 127/83 = 1.52
  120 --
  121 -- The approximate growth rate is 1.14.
  122 --
  123 primes :: [Word64]
  124 primes =
  125     [61,73,83,97,109,127,139,167,193,223,257,293,337,389,443,509,587,673,773
  126     ,887,1021,1171,1327,1553,1783,2039,2351,2699,3089,3559,4093,4703,5399,6203
  127     ,7129,8191,9403,10799,12413,14251,16381,18803,21617,24821,28517,32749
  128     ,37633,43237,49667,57047,65537,75277,86467,99317,114089,131071,150559
  129     ,172933,198659,228203,262139,301123,345889,397337,456409,524287,602233
  130     ,691799,794669,912839,1048573,1204493,1383593,1589333,1825673,2097143
  131     ,2408993,2767201,3178667,3651341,4194301,4817977,5534413,6357353,7302683
  132     ,8388593,9635981,11068817,12714749,14605411,16777213,19271957,22137667
  133     ,25429499,29210821,33554393,38543917,44275331,50858999,58421653,67108859
  134     ,77087833,88550677,101718013,116843297,134217689,154175663,177101321
  135     ,203436029,233686637,268435399,308351357,354202703,406872031,467373223
  136     ,536870909,616702721,708405407,813744131,934746541,1073741789,1233405449
  137     ,1416810797,1627488229,1869493097,2147483647,2466810893,2833621657
  138     ,3254976541,3738986131,4294967291,4933621843,5667243317,6509953069
  139     ,7477972391,8589934583,9867243719,11334486629,13019906153,14955944737
  140     ,17179869143,19734487471,22668973277,26039812297,29911889569,34359738337
  141     ,39468974939,45337946581,52079624657,59823779149,68719476731,78937949837
  142     ,90675893137,104159249321,119647558343,137438953447,157875899707
  143     ,181351786333,208318498651,239295116717,274877906899,315751799521
  144     ,362703572681,416636997289,478590233419,549755813881,631503599063
  145     ,725407145383,833273994643,957180466901,1099511627689
  146     ]
  147 
  148 -- | Behaves as 'safeSuggestSizing', but calls 'error' if given
  149 -- invalid or out-of-range inputs.
  150 suggestSizing :: Int            -- ^ expected maximum capacity
  151               -> Double         -- ^ desired false positive rate (0 < /e/ < 1)
  152               -> (Word64, Int)
  153 suggestSizing cap errs = either fatal id (safeSuggestSizing cap errs)
  154   where fatal = error . ("Data.BloomFilter.Util.suggestSizing: " ++)
  155 
  156 -- $example
  157 --
  158 -- This example reads a dictionary file containing one word per line,
  159 -- constructs a Bloom filter with a 1% false positive rate, and
  160 -- spellchecks its standard input.  Like the Unix @spell@ command, it
  161 -- prints each word that it does not recognize.
  162 --
  163 -- @
  164 -- import Data.Maybe (mapMaybe)
  165 -- import qualified Data.BloomFilter.Easy as B
  166 --
  167 -- main = do
  168 --   filt \<- B.'easyList' 0.01 . words \<$> readFile "\/usr\/share\/dict\/words"
  169 --   let check word | B.'B.elem' word filt  = Nothing
  170 --                  | otherwise         = Just word
  171 --   interact (unlines . mapMaybe check . lines)
  172 -- @