Updating expanding union benchmark's measurement scope

recursion-ninja · recursion-ninja · commit 4101260f04a4 · 2025-05-02T12:51:58.000-04:00
diff --git a/bench/macro/lsm-tree-bench-unions.hs b/bench/macro/lsm-tree-bench-unions.hs
@@ -81,8 +81,6 @@ import qualified Text.Read.Lex as Lex
 import           Database.LSMTree.Extras (groupsOfN)
 import           Database.LSMTree.Internal.ByteString (byteArrayToSBS)
 
-import System.Environment
-
 -- We should be able to write this benchmark
 -- using only use public lsm-tree interface
 import qualified Database.LSMTree.Simple as LSM
@@ -144,14 +142,14 @@ data GlobalOpts = GlobalOpts
     { rootDir     :: !FilePath  -- ^ session directory.
     , tableCount  :: !Int -- ^ Number of  tables in the benchmark
     , initialSize :: !Int
+    , seed        :: !Word64
     }
   deriving stock Show
 
 data RunOpts = RunOpts
     { batchCount :: !Int
     , batchSize  :: !Int
     , check      :: !Bool
-    , seed       :: !Word64
     , pipelined  :: !Bool
     , payRate    :: !PaymentRate
     }
@@ -196,6 +194,7 @@ globalOptsP = pure GlobalOpts
     <*> O.option O.str (O.long "bench-dir" <> O.value (Fold.fold ["_", benchPerformanceOf, "_", benchWorkProductNo]) <> O.showDefault <> O.help "Benchmark directory to put files in")
     <*> O.option O.auto (O.long "table-count" <> O.value 10 <> O.showDefault <> O.help "Number of tables to benchmark")
     <*> O.option O.auto (O.long "initial-size" <> O.value 1_000_000 <> O.showDefault <> O.help "Initial LSM tree size")
+    <*> O.option O.auto (O.long "seed" <> O.value 1337 <> O.showDefault <> O.help "Random seed")
 
 cmdP :: O.Parser Cmd
 cmdP = O.subparser $ mconcat
@@ -216,7 +215,6 @@ runOptsP = pure RunOpts
     <*> O.option O.auto (O.long "batch-count" <> O.value 200 <> O.showDefault <> O.help "Batch count")
     <*> O.option O.auto (O.long "batch-size" <> O.value 256 <> O.showDefault <> O.help "Batch size")
     <*> O.switch (O.long "check" <> O.help "Check generated key distribution")
-    <*> O.option O.auto (O.long "seed" <> O.value 1337 <> O.showDefault <> O.help "Random seed")
     <*> O.switch (O.long "pipelined" <> O.help "Use pipelined mode")
     <*> O.option O.auto (O.long "payment-rate" <> O.value 1 <> O.showDefault <> O.help "Debt repayment rate")
 
@@ -385,17 +383,31 @@ doSetup' gopts = do
     -- Ensure that our mount point exists on the real file system
     createDirectoryIfMissing True rooting
 
+    -- Define some constants
+    let populationBatchSize = 256
+        keyMax = 2 * initialSize gopts
+        keyMin = 1
+
+    -- Create an RNG for randomized deletions
+    refRNG <- newIORef $ MCG.make
+        (toEnum populationBatchSize)
+        (seed gopts)
+
+    -- Populate the specified number of tables
     forM_ (tableRange gopts) $ \tID -> do
         let name = makeTableName tID
         LSM.withSession (rootDir gopts) $ \session -> do
+            -- Create a new table
             tbl <- LSM.newTable @K @V session
-
-            forM_ (groupsOfN 256 [ 1 .. 2 * initialSize gopts ]) $ \batch -> do
-                let (valuesDeletes, _valuesUpserts) = NE.splitAt 128 batch
+            -- Populate the table in batches
+            forM_ (groupsOfN populationBatchSize [ keyMin .. keyMax ]) $ \batch -> do
+                -- Insert all values in the batch
                 LSM.inserts tbl $ V.fromList [
                       (makeKey (fromIntegral k), theValue)
                     | k <- NE.toList batch
                     ]
+                -- Delete the first 128 values of the batch
+                let (valuesDeletes,_) = NE.splitAt 128 batch
                 LSM.deletes tbl $ V.fromList [
                       makeKey (fromIntegral k)
                     | k <- valuesDeletes
@@ -422,7 +434,7 @@ doDryRun' gopts opts = do
     -- calculated some expected statistics for generated batches
     -- using nested do block to limit scope of intermediate bindings n, d, p, and q
     do
-       let d = toInteger (maxBound :: Word64)
+       let d = toInteger $ 2 * initialSize gopts
        -- we generate n random numbers in range of [ 1 .. d ]
        -- what is the chance they are all distinct
        -- In this case each key in a table is could possibly share a key in another table.
@@ -480,7 +492,7 @@ renderRational len rat = sign <> shows prefix ("." ++ suffix)
       suffix = case next of
         0 -> "0"
         n -> take len $ go n
-        
+
       num = numerator rat
       den = denominator rat
       go 0 = ""
@@ -553,21 +565,28 @@ toOperations lookups = batch1
 
 doRun :: GlobalOpts -> RunOpts -> IO ()
 doRun gopts opts = do
-    -- 100 ticks for all tables
-    let PaymentRate paymentRate = payRate opts
-        steps = 100
+    -- Perform 3 measurement phases
+    --   * Phase 1: Measure performance before supplying any credits.
+    --   * Phase 2: Measure performance as credits are incrementally supplied and debt is repaid.
+    --   * Phase 3: Measure performance when debt is 0.
+    let tickCountPrefix = 20
+        tickCountMiddle = 100
+        tickCountSuffix = 20
+        tickCountEnding = maximum indicesPhase3
+        indicesPhase1 = negate <$> reverse [ 0 .. tickCountPrefix ]
+        indicesPhase2 = [ 1 .. tickCountMiddle ]
+        indicesPhase3 = [ tickCountMiddle + 1 .. tickCountMiddle + tickCountSuffix ]
+        PaymentRate paymentRate = payRate opts
         benchmarkIterations h
           | pipelined opts = pipelinedIterations h
           | otherwise = sequentialIterations h
 
-    print $ deriveFileNameForPlot gopts opts
-
-    refRNG <- newIORef $ initGen 
+    refRNG <- newIORef $ initGen
                 (initialSize gopts)
                 (batchSize opts)
                 (batchCount opts)
-                (seed opts)
-    
+                (seed gopts)
+
     putStrLn "Operations per second:"
     measurements <- LSM.withSession (rootDir gopts) $ \session ->
       withLatencyHandle $ \h -> do
@@ -579,78 +598,114 @@ doRun gopts opts = do
 
         LSM.withIncrementalUnions tables $ \table -> do
           LSM.UnionDebt totalDebt <- LSM.remainingUnionDebt table
-          -- Determine the number of credits to supply per tick
-          -- in order to have all debt repaid when 90% complete.
+          -- Determine the number of credits to supply per tick in order to have
+          -- all debt repaid at the time specified by the payment rate.
+          -- Each tick should supply credits equal to:
+          --     paymentRate * totalDebt / tickCountMiddle
           let paymentPerTick = ceiling $ product
-                [ paymentRate,  toInteger totalDebt % 1, 1 % steps ]
-          let tickCredit = LSM.UnionCredits paymentPerTick
-          forM [1 .. steps] $ \step -> do
+                [ paymentRate,  toInteger totalDebt % 1, 1 % tickCountMiddle ]
+
+          let measurePerformance :: Integer -> IO (Int, Int, Double)
+              measurePerformance tickIndex = do
+                -- Note this tick's debt for subsequent measurement purposes.
+                LSM.UnionDebt debtCurr <- LSM.remainingUnionDebt table
+                -- Note the debt that would remain if every scheduled per-tick credit through this tick repaid debt one-for-one (clamped at 0).
+                let paidCurr = max 0 $ totalDebt - fromInteger (max 0 tickIndex) * paymentPerTick
+                currRNG <- readIORef refRNG
+                (nextRNG,time, _, _) <- benchmarkIterations
+                    h
+                    (\_ _ -> pure ())
+                    (initialSize gopts)
+                    (batchSize opts)
+                    (batchCount opts)
+                    currRNG
+                    table
+                -- Update the RNG state
+                writeIORef refRNG nextRNG
+                -- Perform measurement of batched lookups
+                -- Save the result for later to be included in the performance plot
+                let ops = batchCount opts * batchSize opts
+                    rate = fromIntegral ops / time
+                -- Print a status report while running the benchmark
+                printf
+                  (Fold.fold [
+                    "    [%",
+                    show . length $ show tickCountEnding,
+                    "d/",
+                    show tickCountEnding,
+                    "]:    %7.01f ops/sec",
+                    "    with debt = %8d\n"
+                  ])
+                  tickIndex
+                  rate
+                  debtCurr
+                pure (debtCurr, paidCurr, rate)
+
+          -- Phase 1 measurements: Debt = 100%
+          resultsPhase1 <- forM indicesPhase1 $ \step -> do
+            measurePerformance step
+
+          -- Phase 2 measurements: Debt ∈ [0%, 99%]
+          resultsPhase2 <- forM indicesPhase2 $ \step -> do
             LSM.UnionDebt debtPrev <- LSM.remainingUnionDebt table
-            when (debtPrev > 0) . void $ 
-              LSM.supplyUnionCredits table tickCredit
-            LSM.UnionDebt debtCurr <- LSM.remainingUnionDebt table
-            currRNG <- readIORef refRNG
-            (nextRNG,time, _, _) <- benchmarkIterations
-                h
-                (\_ _ -> pure ())
-                (initialSize gopts)
-                (batchSize opts)
-                (batchCount opts)
-                currRNG
-                table
-            writeIORef refRNG nextRNG
-
-            -- Perform measurement of batched lookups
-            -- Save the result for later to be included in the performance plot
-            let ops = batchCount opts * batchSize opts
-                rate = fromIntegral ops / time
-            -- Print a status report while running the benchmark
-            printf
-              (Fold.fold [
-                "    [%",
-                show . length $ show steps,
-                "d/",
-                show steps,
-                "]:    %7.01f ops/sec",
-                "    with debt = %8d\n"
-              ])
-              step
-              rate
-              debtCurr
-            pure (debtCurr, rate)
-
-    let (balances', operations) = unzip measurements
-        maxDebit = toInteger $ head balances'
+            -- When there is debt remaining, supply the fixed credits-per-tick.
+            when (debtPrev > 0) . void $
+              LSM.supplyUnionCredits table (LSM.UnionCredits paymentPerTick)
+            measurePerformance step
+
+          -- Phase 3 measurements: Debt = 0%
+          resultsPhase3 <- forM indicesPhase3 $ \step -> do
+            measurePerformance step
+
+          pure $ mconcat [ resultsPhase1, resultsPhase2, resultsPhase3 ]
+
+    let (balances', payments', operations) = unzip3 measurements
         maxValue = ceiling $ maximum operations
-        balances = (\b -> fromRational $ (fromIntegral b * maxValue) % maxDebit) <$> balances'
+        standardize xs =
+          let maxInput = toInteger $ maximum xs
+              scale x = fromRational $ (fromIntegral x * maxValue) % maxInput
+          in  scale <$> xs
+        balances = standardize balances'
+        payments = standardize payments'
 
     -- Generate a performance plot based on the benchmark results.
-    Plot.toFile Plot.def (rootDir gopts <> "/benchmark.png") $ do
+    Plot.toFile Plot.def (rootDir gopts <> "/" <> deriveFileNameForPlot gopts opts) $ do
       Plot.layout_title .= "Incremental Unions Performance"
       Plot.layout_x_axis . Plot.laxis_override .= Plot.axisGridHide
       Plot.layout_x_axis . Plot.laxis_title    .= "Credits supplied over time"
       Plot.layout_y_axis . Plot.laxis_title    .= "Lookup access time"
-      Plot.plot $ fillBetween "Debt balance" [ (d,(0,v)) | (d, v) <- zip [1 :: Word .. ] balances ]
+      let colorD = Color.sRGB 0.875 1.0 0.125 `Plot.withOpacity` 0.5
+      let colorE = Color.sRGB 0.625 1.0 0.875 `Plot.withOpacity` 0.5
+      Plot.plot $ fillBetween colorD "Debt balance"
+        [ (d,(0,v)) | (d, v) <- zip [1 :: Word .. ] balances ]
+      Plot.plot $ fillBetween colorE "Extra credits"
+        [ (d,(v,w)) | (d, v, w) <- zip3 [1 :: Word .. ] balances payments ]
       Plot.plot $ Plot.line "operations per second" [ zip [1 :: Word .. ] operations ]
 
-fillBetween :: String -> [(x1, (y1, y1))] -> Plot.EC l20 (Plot.PlotFillBetween x1 y1)
-fillBetween title vs = Plot.liftEC $ do
+fillBetween :: Plot.AlphaColour Double -> String -> [(x, (y, y))] -> Plot.EC l20 (Plot.PlotFillBetween x y)
+fillBetween color title vs = Plot.liftEC $ do
   Plot.plot_fillbetween_title .= title
-  let color = Color.sRGB 0.875 1.0 0.125 `Plot.withOpacity` 0.5
   Plot.plot_fillbetween_style .= Plot.solidFillStyle color
   Plot.plot_fillbetween_values .= vs
 
 deriveFileNameForPlot :: GlobalOpts -> RunOpts -> FilePath
 deriveFileNameForPlot gOpts rOpts =
-    let partTable = show $ tableCount gOpts
-        partWidth = List.intercalate "_" . fmap Fold.toList . groupsOfN 3 . reverse . show $ initialSize gOpts
-    in  Fold.fold
+    let sep1000th = reverse . List.intercalate "_" . fmap Fold.toList . groupsOfN 3 . reverse . show
+        partTable = show $ tableCount gOpts
+        partWidth = sep1000th $ initialSize gOpts
+        partSeed0 = printf "SEED_%016x" (seed gOpts)
+        partRatio =
+          let PaymentRate r = payRate rOpts
+              n = numerator r
+              d = denominator r
+              sep = "x"
+          in  show n <> sep <> show d
+    in  List.intercalate "-"
           [ "benchmark"
-          , partTable
-          , "×"
-          , partWidth
-          , ".png"
-          ]
+          , partTable <> "x" <> partWidth
+          , partSeed0
+          , partRatio
+          ] <> ".png"
 
 {-
 data GlobalOpts = GlobalOpts
@@ -703,15 +758,6 @@ sequentialIterations h output !initialSize !batchSize !batchCount !currRNG !tbl
     (x,y,z) <- timed_ $ forM_ (zip [0 ..] allBatches) $ sequentialIteration h output tbl
     pure (nextRNG,x,y,z)
 
-{-
-generateBatch  ::
-       Int       -- ^ initial size of the collection
-    -> Int       -- ^ batch size
-    -> MCG.MCG   -- ^ generator
-    -> Int       -- ^ batch number
-    -> (MCG.MCG, V.Vector K)
-
--}
 -------------------------------------------------------------------------------
 -- pipelined
 -------------------------------------------------------------------------------
@@ -764,7 +810,7 @@ pipelinedIteration :: LatencyHandle
                    -> MVar (LSM.Table K V)
                    -> MVar (V.Vector K)
                    -> MVar (V.Vector K)
-                   -> MVar [V.Vector K] 
+                   -> MVar [V.Vector K]
                    -> LSM.Table K V
                    -> Int
                    -> IO (LSM.Table K V)
@@ -788,7 +834,7 @@ pipelinedIteration h output
       -- using tbl_n. They used it to generate tbl_n+1 (which they gave us).
       LSM.closeTable tbl_n
       pure tbl_n1
-  
+
     ls_next <- dequeue queue
     putMVar syncTblOut tbl_n1
     putMVar syncVecOut ls_next
@@ -853,7 +899,7 @@ pipelinedIterations h output !initialSize !batchSize !batchCount !currRNG tbl_0
 
 dequeue :: Monoid a => MVar [a] -> IO a
 dequeue q = modifyMVar q $ pure . swap . fromMaybe (mempty, []) . List.uncons
-    
+
 -------------------------------------------------------------------------------
 -- measure batch latency
 -------------------------------------------------------------------------------
@@ -965,7 +1011,6 @@ main = do
     putStrLn "         To benchmark in release mode, pass:"
     putStrLn "         --project-file=cabal.project.release"
 #endif
-    getArgs >>= Fold.traverse_ print
     (gopts, cmd) <- O.customExecParser prefs cliP
     print gopts
     print cmd