never executed always true always false
    1 -- |
    2 --
    3 -- A fast, space efficient Bloom filter implementation.  A Bloom
    4 -- filter is a set-like data structure that provides a probabilistic
    5 -- membership test.
    6 --
    7 -- * Queries do not give false negatives.  When an element is added to
    8 --   a filter, a subsequent membership test will definitely return
    9 --   'True'.
   10 --
   11 -- * False positives /are/ possible.  If an element has not been added
   12 --   to a filter, a membership test /may/ nevertheless indicate that
   13 --   the element is present.
   14 --
   15 -- This module provides low-level control.  For an easier to use
   16 -- interface, see the "Data.BloomFilter.Easy" module.
   17 
   18 module Data.BloomFilter (
   19     -- * Overview
   20     -- $overview
   21 
   22     -- ** Ease of use
   23     -- $ease
   24 
   25     -- ** Performance
   26     -- $performance
   27 
   28     -- ** Differences from bloomfilter package
   29     -- $differences
   30 
   31     -- * Types
   32     Hash,
   33     Bloom,
   34     MBloom,
   35     Bloom',
   36     MBloom',
   37     CheapHashes,
   38     RealHashes,
   39 
   40     -- * Immutable Bloom filters
   41 
   42     -- ** Conversion
   43     freeze,
   44     thaw,
   45     unsafeFreeze,
   46 
   47     -- ** Creation
   48     unfold,
   49 
   50     fromList,
   51     empty,
   52     singleton,
   53 
   54     -- ** Accessors
   55     length,
   56     elem,
   57     elemHashes,
   58     notElem,
   59 ) where
   60 
   61 import           Control.Exception (assert)
   62 import           Control.Monad (forM_, liftM)
   63 import           Control.Monad.ST (ST, runST)
   64 import           Data.BloomFilter.Hash (CheapHashes, Hash, Hashable,
   65                      Hashes (..), RealHashes)
   66 import           Data.BloomFilter.Internal (Bloom' (..), bloomInvariant)
   67 import           Data.BloomFilter.Mutable (MBloom, MBloom', insert, new)
   68 import qualified Data.BloomFilter.Mutable.Internal as MB
   69 import           Data.Word (Word64)
   70 
   71 import           Prelude hiding (elem, length, notElem)
   72 
   73 import qualified Data.BloomFilter.BitVec64 as V
   74 
   75 -- | Bloom filter using 'CheapHashes' hashing scheme.
   76 type Bloom = Bloom' CheapHashes
   77 
   78 -- | Create an immutable Bloom filter, using the given setup function
   79 -- which executes in the 'ST' monad.
   80 --
   81 -- Example:
   82 --
   83 -- @
   84 -- TODO
   85 --import "Data.BloomFilter.Hash" (cheapHashes)
   86 --
   87 --filter = create (cheapHashes 3) 1024 $ \mf -> do
   88 --           insertMB mf \"foo\"
   89 --           insertMB mf \"bar\"
   90 -- @
   91 --
   92 -- Note that the result of the setup function is not used.
   93 create :: Int        -- ^ number of hash functions to use
   94         -> Word64                 -- ^ number of bits in filter
   95         -> (forall s. (MBloom' s h a -> ST s ()))  -- ^ setup function
   96         -> Bloom' h a
   97 {-# INLINE create #-}
   98 create hash numBits body = runST $ do
   99     mb <- new hash numBits
  100     body mb
  101     unsafeFreeze mb
  102 
  103 -- | Create an immutable Bloom filter from a mutable one.  The mutable
  104 -- filter may be modified afterwards.
  105 freeze :: MBloom' s h a -> ST s (Bloom' h a)
  106 freeze mb = do
  107     ba <- V.freeze (MB.bitArray mb)
  108     let !bf = Bloom (MB.hashesN mb) (MB.size mb) ba
  109     assert (bloomInvariant bf) $ pure bf
  110 
  111 -- | Create an immutable Bloom filter from a mutable one.  The mutable
  112 -- filter /must not/ be modified afterwards, or a runtime crash may
  113 -- occur.  For a safer creation interface, use 'freeze' or 'create'.
  114 unsafeFreeze :: MBloom' s h a -> ST s (Bloom' h a)
  115 unsafeFreeze mb = do
  116     ba <- V.unsafeFreeze (MB.bitArray mb)
  117     let !bf = Bloom (MB.hashesN mb) (MB.size mb) ba
  118     assert (bloomInvariant bf) $ pure bf
  119 
  120 -- | Copy an immutable Bloom filter to create a mutable one.  There is
  121 -- no non-copying equivalent.
  122 thaw :: Bloom' h a -> ST s (MBloom' s h a)
  123 thaw ub = MB.MBloom (hashesN ub) (size ub) `liftM` V.thaw (bitArray ub)
  124 
  125 -- | Create an empty Bloom filter.
  126 empty :: Int                    -- ^ number of hash functions to use
  127       -> Word64                 -- ^ number of bits in filter
  128       -> Bloom' h a
  129 {-# INLINE [1] empty #-}
  130 empty hash numBits = create hash numBits (\_ -> return ())
  131 
  132 -- | Create a Bloom filter with a single element.
  133 singleton :: (Hashes h, Hashable a)
  134           => Int               -- ^ number of hash functions to use
  135           -> Word64            -- ^ number of bits in filter
  136           -> a                 -- ^ element to insert
  137           -> Bloom' h a
  138 singleton hash numBits elt = create hash numBits (\mb -> insert mb elt)
  139 
  140 -- | Query an immutable Bloom filter for membership.  If the value is
  141 -- present, return @True@.  If the value is not present, there is
  142 -- /still/ some possibility that @True@ will be returned.
  143 elem :: (Hashes h, Hashable a) => a -> Bloom' h a -> Bool
  144 elem elt ub = elemHashes (makeHashes elt) ub
  145 {-# SPECIALIZE elem :: Hashable a => a -> Bloom a -> Bool #-}
  146 
  147 -- | Query an immutable Bloom filter for membership using already constructed 'Hashes' value.
  148 elemHashes :: Hashes h => h a -> Bloom' h a -> Bool
  149 elemHashes !ch !ub = go 0 where
  150     go :: Int -> Bool
  151     go !i | i >= hashesN ub
  152           = True
  153     go !i = let idx' :: Word64
  154                 !idx' = evalHashes ch i in
  155             let idx :: Int
  156                 !idx = fromIntegral (idx' `V.unsafeRemWord64` size ub) in
  157             -- While the idx' can cover the full Word64 range,
  158             -- after taking the remainder, it now must fit in
  159             -- and Int because it's less than the filter size.
  160             if V.unsafeIndex (bitArray ub) idx
  161               then go (i + 1)
  162               else False
  163 {-# SPECIALIZE elemHashes :: CheapHashes a -> Bloom a -> Bool #-}
  164 
  165 -- | Query an immutable Bloom filter for non-membership.  If the value
  166 -- /is/ present, return @False@.  If the value is not present, there
  167 -- is /still/ some possibility that @False@ will be returned.
  168 notElem :: (Hashes h, Hashable a) => a -> Bloom' h a -> Bool
  169 notElem elt ub = notElemHashes (makeHashes elt) ub
  170 
  171 -- | Query an immutable Bloom filter for non-membership using already constructed 'Hashes' value.
  172 notElemHashes :: Hashes h => h a -> Bloom' h a -> Bool
  173 notElemHashes !ch !ub = not (elemHashes ch ub)
  174 
  175 -- | Return the size of an immutable Bloom filter, in bits.
  176 length :: Bloom' h a -> Word64
  177 length = size
  178 
  179 -- | Build an immutable Bloom filter from a seed value.  The seeding
  180 -- function populates the filter as follows.
  181 --
  182 --   * If it returns 'Nothing', it is finished producing values to
  183 --     insert into the filter.
  184 --
  185 --   * If it returns @'Just' (a,b)@, @a@ is added to the filter and
  186 --     @b@ is used as a new seed.
  187 unfold :: forall a b h. (Hashes h, Hashable a)
  188        => Int                       -- ^ number of hash functions to use
  189        -> Word64                    -- ^ number of bits in filter
  190        -> (b -> Maybe (a, b))       -- ^ seeding function
  191        -> b                         -- ^ initial seed
  192        -> Bloom' h a
  193 {-# INLINE unfold #-}
  194 unfold hs numBits f k = create hs numBits (loop k)
  195   where loop :: forall s. b -> MBloom' s h a -> ST s ()
  196         loop j mb = case f j of
  197                       Just (a, j') -> insert mb a >> loop j' mb
  198                       _            -> return ()
  199 
  200 -- | Create an immutable Bloom filter, populating it from a list of
  201 -- values.
  202 --
  203 -- Here is an example that uses the @cheapHashes@ function from the
  204 -- "Data.BloomFilter.Hash" module to create a hash function that
  205 -- returns three hashes.
  206 --
  207 -- @
  208 -- filt = fromList 3 1024 [\"foo\", \"bar\", \"quux\"]
  209 -- @
  210 fromList :: (Hashes h, Hashable a)
  211          => Int                -- ^ number of hash functions to use
  212          -> Word64             -- ^ number of bits in filter
  213          -> [a]                -- ^ values to populate with
  214          -> Bloom' h a
  215 fromList hs numBits list = create hs numBits $ forM_ list . insert
  216 
  217 -- $overview
  218 --
  219 -- Each of the functions for creating Bloom filters accepts two parameters:
  220 --
  221 -- * The number of bits that should be used for the filter.  Note that
  222 --   a filter is fixed in size; it cannot be resized after creation.
  223 --
  224 -- * A number of hash functions, /k/, to be used for the filter.
  225 --
  226 -- By choosing these parameters with care, it is possible to tune for
  227 -- a particular false positive rate.
  228 -- The 'Data.BloomFilter.Easy.suggestSizing' function in
  229 -- the "Data.BloomFilter.Easy" module calculates useful estimates for
  230 -- these parameters.
  231 
  232 -- $ease
  233 --
  234 -- This module provides immutable interfaces for working with a
  235 -- query-only Bloom filter, and for converting to and from mutable
  236 -- Bloom filters.
  237 --
  238 -- For a higher-level interface that is easy to use, see the
  239 -- "Data.BloomFilter.Easy" module.
  240 
  241 -- $performance
  242 --
  243 -- The implementation has been carefully tuned for high performance
  244 -- and low space consumption.
  245 
  246 -- $differences
  247 --
  248 -- This package is (almost entirely rewritten) fork of
  249 -- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
  250 --
  251 -- The main differences are
  252 --
  253 -- * This packages support bloomfilters of arbitrary sizes
  254 --   (not limited to powers of two). Also sizes over 2^32 are supported.
  255 --
  256 -- * The 'Bloom' and 'MBloom' types are parametrised over 'Hashes' variable,
  257 --   instead of having a @a -> ['Hash']@ typed field.
  258 --   This separation allows clean de/serialization of Bloom filters in this
  259 --   package, as the hashing scheme is a static.
  260 --
  261 -- * [XXH3 hash](https://xxhash.com/) is used instead of Jenkins'
  262 --   lookup3.