Skip to content

Improve Int{Map,Set} construction from asc lists #1123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions containers-tests/benchmarks/IntMap.hs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ main = do
, bench "mapMaybeWithKey" $ whnf (M.mapMaybeWithKey (const maybeDel)) m
, bench "fromList" $ whnf M.fromList elems
, bench "fromAscList" $ whnf M.fromAscList elems
, bench "fromDistinctAscList" $ whnf M.fromDistinctAscList elems
, bench "fromAscList:fusion" $
whnf (\n -> M.fromAscList [(i,()) | i <- [1..n]]) bound
, bench "minView" $ whnf (maybe 0 (\((k,v), m) -> k+v+M.size m) . M.minViewWithKey)
(M.fromList $ zip [1..10] [1..10])
, bench "spanAntitone" $ whnf (M.spanAntitone (<key_mid)) m
Expand All @@ -61,17 +62,18 @@ main = do
where
elems = elems_hits
elems_hits = zip keys values
elems_mid = zip (map (+ (2^12 `div` 2)) keys) values
elems_most = zip (map (+ (2^12 `div` 10)) keys) values
elems_mid = zip (map (+ (bound `div` 2)) keys) values
elems_most = zip (map (+ (bound `div` 10)) keys) values
elems_misses = zip (map (\x-> x * 2 + 1) keys) values
elems_mixed = zip mixedKeys values
--------------------------------------------------------
keys = [1..2^12]
bound = 2^12
keys = [1..bound]
keys' = fmap (+ 1000000) keys
keys'' = fmap (* 2) [1..2^12]
keys'' = fmap (* 2) [1..bound]
mixedKeys = interleave keys keys'
values = [1..2^12]
key_mid = 2^11
values = [1..bound]
key_mid = bound `div` 2
--------------------------------------------------------
sum k v1 v2 = k + v1 + v2
consPair k v xs = (k, v) : xs
Expand Down
12 changes: 10 additions & 2 deletions containers-tests/benchmarks/IntSet.hs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,11 @@ main = do
, bench "fromList" $ whnf IS.fromList elems
, bench "fromRange" $ whnf IS.fromRange (1,bound)
, bench "fromRange:small" $ whnf IS.fromRange (-1,0)
, bench "fromAscList" $ whnf IS.fromAscList elems
, bench "fromDistinctAscList" $ whnf IS.fromDistinctAscList elems
, bench "fromAscList" $ whnf fromAscListNoinline elems
, bench "fromAscList:fusion" $ whnf (\n -> IS.fromAscList [1..n]) bound
, bench "fromAscList:sparse" $ whnf fromAscListNoinline elems_sparse
, bench "fromAscList:sparse:fusion" $
whnf (\n -> IS.fromAscList (map (*64) [1..n])) bound
, bench "disjoint:false" $ whnf (IS.disjoint s) s_even
, bench "disjoint:true" $ whnf (IS.disjoint s_odd) s_even
, bench "null.intersection:false" $ whnf (IS.null. IS.intersection s) s_even
Expand Down Expand Up @@ -81,6 +84,11 @@ ins xs s0 = foldl' (\s a -> IS.insert a s) s0 xs
del :: [Int] -> IS.IntSet -> IS.IntSet
del xs s0 = foldl' (\s k -> IS.delete k s) s0 xs

-- NOINLINE to work around an issue where the inlined function doesn't get
-- optimized (see GHC #25878).
fromAscListNoinline :: [Int] -> IS.IntSet
fromAscListNoinline = IS.fromAscList
{-# NOINLINE fromAscListNoinline #-}


-- | Automata contain just the transitions
Expand Down
47 changes: 40 additions & 7 deletions containers-tests/tests/intmap-properties.hs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import qualified Prelude (map, filter)

import Data.List (nub,sort)
import qualified Data.List as List
import qualified Data.List.NonEmpty as NE
import qualified Data.IntSet as IntSet
import Test.Tasty
import Test.Tasty.HUnit
Expand Down Expand Up @@ -157,7 +158,6 @@ main = defaultMain $ testGroup "intmap-properties"
, testProperty "mergeWithKey model" prop_mergeWithKeyModel
, testProperty "merge valid" prop_merge_valid
, testProperty "mergeA effects" prop_mergeA_effects
, testProperty "fromAscList" prop_ordered
, testProperty "fromList then toList" prop_list
, testProperty "toDescList" prop_descList
, testProperty "toAscList+toDescList" prop_ascDescList
Expand Down Expand Up @@ -248,6 +248,10 @@ main = defaultMain $ testGroup "intmap-properties"
, testProperty "mapKeysWith" prop_mapKeysWith
, testProperty "mapKeysMonotonic" prop_mapKeysMonotonic
, testProperty "compare" prop_compare
, testProperty "fromAscList" prop_fromAscList
, testProperty "fromAscListWith" prop_fromAscListWith
, testProperty "fromAscListWithKey" prop_fromAscListWithKey
, testProperty "fromDistinctAscList" prop_fromDistinctAscList
]

{--------------------------------------------------------------------
Expand Down Expand Up @@ -1402,12 +1406,6 @@ prop_mergeA_effects xs ys

----------------------------------------------------------------

prop_ordered :: Property
prop_ordered
= forAll (choose (5,100)) $ \n ->
let xs = [(x,()) | x <- [0..n::Int]]
in fromAscList xs == fromList xs

prop_list :: [Int] -> Bool
prop_list xs = (sort (nub xs) == [x | (x,()) <- toList (fromList [(x,()) | x <- xs])])

Expand Down Expand Up @@ -1984,3 +1982,38 @@ prop_mapKeysMonotonic (Positive a) b m =

prop_compare :: IntMap OrdA -> IntMap OrdA -> Property
prop_compare m1 m2 = compare m1 m2 === compare (toList m1) (toList m2)

prop_fromAscList :: [(Int, A)] -> Property
prop_fromAscList kxs =
valid t .&&.
t === fromList sortedKxs
where
sortedKxs = List.sortBy (comparing fst) kxs
t = fromAscList sortedKxs

prop_fromAscListWith :: Fun (A, A) A -> [(Int, A)] -> Property
prop_fromAscListWith f kxs =
valid t .&&.
t === fromListWith (applyFun2 f) sortedKxs
where
sortedKxs = List.sortBy (comparing fst) kxs
t = fromAscListWith (applyFun2 f) sortedKxs

prop_fromAscListWithKey :: Fun (Int, A, A) A -> [(Int, A)] -> Property
prop_fromAscListWithKey f kxs =
valid t .&&.
t === fromListWithKey (applyFun3 f) sortedKxs
where
sortedKxs = List.sortBy (comparing fst) kxs
t = fromAscListWithKey (applyFun3 f) sortedKxs

prop_fromDistinctAscList :: [(Int, A)] -> Property
prop_fromDistinctAscList kxs =
valid t .&&.
toList t === nubSortedKxs
where
nubSortedKxs =
List.map NE.head $
NE.groupBy ((==) `on` fst) $
List.sortBy (comparing fst) kxs
t = fromDistinctAscList nubSortedKxs
24 changes: 19 additions & 5 deletions containers-tests/tests/intset-properties.hs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ main = defaultMain $ testGroup "intset-properties"
, testProperty "prop_difference" prop_difference
, testProperty "prop_intersection" prop_intersection
, testProperty "prop_symmetricDifference" prop_symmetricDifference
, testProperty "prop_Ordered" prop_Ordered
, testProperty "prop_List" prop_List
, testProperty "prop_DescList" prop_DescList
, testProperty "prop_AscDescList" prop_AscDescList
Expand Down Expand Up @@ -90,6 +89,8 @@ main = defaultMain $ testGroup "intset-properties"
, testProperty "delete" prop_delete
, testProperty "deleteMin" prop_deleteMin
, testProperty "deleteMax" prop_deleteMax
, testProperty "fromAscList" prop_fromAscList
, testProperty "fromDistinctAscList" prop_fromDistinctAscList
]

----------------------------------------------------------------
Expand Down Expand Up @@ -281,10 +282,6 @@ prop_disjoint a b = a `disjoint` b == null (a `intersection` b)
{--------------------------------------------------------------------
Lists
--------------------------------------------------------------------}
prop_Ordered
= forAll (choose (5,100)) $ \n ->
let xs = concat [[i-n,i-n]|i<-[0..2*n :: Int]]
in fromAscList xs == fromList xs

prop_List :: [Int] -> Bool
prop_List xs
Expand Down Expand Up @@ -521,3 +518,20 @@ prop_deleteMin s = toList (deleteMin s) === if null s then [] else tail (toList

prop_deleteMax :: IntSet -> Property
prop_deleteMax s = toList (deleteMax s) === if null s then [] else init (toList s)

prop_fromAscList :: [Int] -> Property
prop_fromAscList xs =
valid t .&&.
toList t === nubSortedXs
where
sortedXs = sort xs
nubSortedXs = List.map NE.head $ NE.group sortedXs
t = fromAscList sortedXs

prop_fromDistinctAscList :: [Int] -> Property
prop_fromDistinctAscList xs =
valid t .&&.
toList t === nubSortedXs
where
nubSortedXs = List.map NE.head $ NE.group $ sort xs
t = fromDistinctAscList nubSortedXs
168 changes: 91 additions & 77 deletions containers/src/Data/IntMap/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,10 @@ module Data.IntMap.Internal (
, bin
, binCheckLeft
, binCheckRight
, MonoState(..)
, Stack(..)
, ascLinkTop
, ascLinkAll

-- * Used by "IntMap.Merge.Lazy" and "IntMap.Merge.Strict"
, mapWhenMissing
Expand Down Expand Up @@ -3385,8 +3389,8 @@ fromListWithKey f xs
-- > fromAscList [(3,"b"), (5,"a"), (5,"b")] == fromList [(3, "b"), (5, "b")]

fromAscList :: [(Key,a)] -> IntMap a
fromAscList = fromMonoListWithKey Nondistinct (\_ x _ -> x)
{-# NOINLINE fromAscList #-}
fromAscList xs = fromAscListWithKey (\_ x _ -> x) xs
{-# INLINE fromAscList #-} -- Inline for list fusion

-- | \(O(n)\). Build a map from a list of key\/value pairs where
-- the keys are in ascending order, with a combining function on equal keys.
Expand All @@ -3400,8 +3404,8 @@ fromAscList = fromMonoListWithKey Nondistinct (\_ x _ -> x)
-- Also see the performance note on 'fromListWith'.

fromAscListWith :: (a -> a -> a) -> [(Key,a)] -> IntMap a
fromAscListWith f = fromMonoListWithKey Nondistinct (\_ x y -> f x y)
{-# NOINLINE fromAscListWith #-}
fromAscListWith f xs = fromAscListWithKey (\_ x y -> f x y) xs
{-# INLINE fromAscListWith #-} -- Inline for list fusion

-- | \(O(n)\). Build a map from a list of key\/value pairs where
-- the keys are in ascending order, with a combining function on equal keys.
Expand All @@ -3415,86 +3419,66 @@ fromAscListWith f = fromMonoListWithKey Nondistinct (\_ x y -> f x y)
--
-- Also see the performance note on 'fromListWith'.

-- See Note [fromAscList implementation]
fromAscListWithKey :: (Key -> a -> a -> a) -> [(Key,a)] -> IntMap a
fromAscListWithKey f = fromMonoListWithKey Nondistinct f
{-# NOINLINE fromAscListWithKey #-}
fromAscListWithKey f xs = ascLinkAll (Foldable.foldl' next MSNada xs)
where
next s (!ky, y) = case s of
MSNada -> MSPush ky y Nada
MSPush kx x stk
| kx == ky -> MSPush ky (f ky y x) stk
| otherwise -> let m = branchMask kx ky
in MSPush ky y (ascLinkTop stk kx (Tip kx x) m)
{-# INLINE fromAscListWithKey #-} -- Inline for list fusion

-- | \(O(n)\). Build a map from a list of key\/value pairs where
-- the keys are in ascending order and all distinct.
--
-- __Warning__: This function should be used only if the keys are in
-- strictly increasing order. This precondition is not checked. Use 'fromList'
-- if the precondition may not hold.
--
-- > fromDistinctAscList [(3,"b"), (5,"a")] == fromList [(3, "b"), (5, "a")]

fromDistinctAscList :: [(Key,a)] -> IntMap a
fromDistinctAscList = fromMonoListWithKey Distinct (\_ x _ -> x)
{-# NOINLINE fromDistinctAscList #-}

-- | \(O(n)\). Build a map from a list of key\/value pairs with monotonic keys
-- and a combining function.
-- @fromDistinctAscList = 'fromAscList'@
--
-- The precise conditions under which this function works are subtle:
-- For any branch mask, keys with the same prefix w.r.t. the branch
-- mask must occur consecutively in the list.
-- See warning on 'fromAscList'.
--
-- Also see the performance note on 'fromListWith'.

fromMonoListWithKey :: Distinct -> (Key -> a -> a -> a) -> [(Key,a)] -> IntMap a
fromMonoListWithKey distinct f = go
where
go [] = Nil
go ((kx,vx) : zs1) = addAll' kx vx zs1

-- `addAll'` collects all keys equal to `kx` into a single value,
-- and then proceeds with `addAll`.
addAll' !kx vx []
= Tip kx vx
addAll' !kx vx ((ky,vy) : zs)
| Nondistinct <- distinct, kx == ky
= let v = f kx vy vx in addAll' ky v zs
-- inlined: | otherwise = addAll kx (Tip kx vx) (ky : zs)
| m <- branchMask kx ky
, Inserted ty zs' <- addMany' m ky vy zs
= addAll kx (linkWithMask m ky ty kx (Tip kx vx)) zs'

-- for `addAll` and `addMany`, kx is /a/ key inside the tree `tx`
-- `addAll` consumes the rest of the list, adding to the tree `tx`
addAll !_kx !tx []
= tx
addAll !kx !tx ((ky,vy) : zs)
| m <- branchMask kx ky
, Inserted ty zs' <- addMany' m ky vy zs
= addAll kx (linkWithMask m ky ty kx tx) zs'

-- `addMany'` is similar to `addAll'`, but proceeds with `addMany'`.
addMany' !_m !kx vx []
= Inserted (Tip kx vx) []
addMany' !m !kx vx zs0@((ky,vy) : zs)
| Nondistinct <- distinct, kx == ky
= let v = f kx vy vx in addMany' m ky v zs
-- inlined: | otherwise = addMany m kx (Tip kx vx) (ky : zs)
| mask kx m /= mask ky m
= Inserted (Tip kx vx) zs0
| mxy <- branchMask kx ky
, Inserted ty zs' <- addMany' mxy ky vy zs
= addMany m kx (linkWithMask mxy ky ty kx (Tip kx vx)) zs'

-- `addAll` adds to `tx` all keys whose prefix w.r.t. `m` agrees with `kx`.
addMany !_m !_kx tx []
= Inserted tx []
addMany !m !kx tx zs0@((ky,vy) : zs)
| mask kx m /= mask ky m
= Inserted tx zs0
| mxy <- branchMask kx ky
, Inserted ty zs' <- addMany' mxy ky vy zs
= addMany m kx (linkWithMask mxy ky ty kx tx) zs'
{-# INLINE fromMonoListWithKey #-}

data Inserted a = Inserted !(IntMap a) ![(Key,a)]

data Distinct = Distinct | Nondistinct
-- This definition exists for backwards compatibility. It offers no advantage
-- over @fromAscList@.
fromDistinctAscList :: [(Key,a)] -> IntMap a
-- Note: There is nothing we can optimize compared to fromAscList.
-- The adjacent key equals check (kx == ky) might seem unnecessary for
-- fromDistinctAscList, but it guards branchMask which has undefined behavior
-- under that case. We could error on kx == ky instead, but that isn't any
-- better.
fromDistinctAscList = fromAscList
{-# INLINE fromDistinctAscList #-} -- Inline for list fusion

data Stack a
= Nada
| Push {-# UNPACK #-} !Int !(IntMap a) !(Stack a)

data MonoState a
= MSNada
| MSPush {-# UNPACK #-} !Key a !(Stack a)

ascLinkTop :: Stack a -> Int -> IntMap a -> Int -> Stack a
ascLinkTop stk !rk r !rm = case stk of
Nada -> Push rm r stk
Push m l stk'
| i2w m < i2w rm -> let p = Prefix (mask rk m .|. m)
in ascLinkTop stk' rk (Bin p l r) rm
| otherwise -> Push rm r stk

ascLinkAll :: MonoState a -> IntMap a
ascLinkAll s = case s of
MSNada -> Nil
MSPush kx x stk -> ascLinkStack stk kx (Tip kx x)
{-# INLINABLE ascLinkAll #-}

ascLinkStack :: Stack a -> Int -> IntMap a -> IntMap a
ascLinkStack stk !rk r = case stk of
Nada -> r
Push m l stk'
| signBranch p -> Bin p r l
| otherwise -> ascLinkStack stk' rk (Bin p l r)
where
p = Prefix (mask rk m .|. m)

{--------------------------------------------------------------------
Eq
Expand Down Expand Up @@ -3884,3 +3868,33 @@ withEmpty bars = " ":bars
-- * This is similar to the Map merge complexity, which is O(m log (n/m)).
-- * When m is a small constant the term simplifies to O(min(n, W)), which is
-- just the complexity we expect for single operations like insert and delete.

-- Note [fromAscList implementation]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- fromAscList is an implementation that builds up the result bottom-up
-- in linear time. It maintains a state (MonoState) that gets updated with
-- key-value pairs from the input list one at a time. The state contains the
-- last key-value pair, and a stack of pending trees.
--
-- For a new key-value pair, the branchMask with the previous key is computed.
-- This represents the depth of the lowest common ancestor that the tree with
-- the previous key, say tl, and the tree with the new key, tr, must have in
-- the final result. Since the keys are in ascending order we expect no more
-- keys in tl, and we can build it by moving up the stack and linking trees. We
-- know when to stop by the branchMask value. We must not link higher than that
-- depth, otherwise instead of tl we will build the parent of tl prematurely
-- before tr is ready. Once the linking is done, tl will be at the top of the
-- stack.
--
-- We also store the branchMask of a tree with its future right sibling in the
-- stack. This is an optimization, benchmarks show that this is faster than
-- recomputing the branchMask values when linking trees.
--
-- In the end, we link all the trees remaining in the stack. There is a small
-- catch: negative keys appear in the input before non-negative keys (if they
-- both appear), but the tree with negative keys and the tree with non-negative
-- keys must be the right and left child of the root respectively. So we check
-- for this and link them accordingly.
--
-- The implementation is defined as a foldl' over the input list, which makes
-- it a good consumer in list fusion.
Loading