
Commit 6a33ad1

Fix deadlock; circular buffer fail and DeleteRange while in async (needs tests)
1 parent 254fe55 commit 6a33ad1

File tree

1 file changed (+58, -15 lines)

log_cache_async.go

Lines changed: 58 additions & 15 deletions
@@ -102,21 +102,23 @@ func (c *LogCacheAsync) LastIndex() (uint64, error) {
 	return atomic.LoadUint64(&c.lastIndex), nil
 }
 
-// GetLog gets a log entry at a given index.
-func (c *LogCacheAsync) GetLog(index uint64, log *Log) error {
-	// Quick check to see if it's even possibly in the cache so we avoid locking
-	// at all in the case of scanning through old records.
+// minPossibleIdx is the lowest log we could possibly have cached. We might
+// not have one that low because we just started or because we are currently
+// writing a batch over the top, but it's a lower bound.
+func (c *LogCacheAsync) minPossibleIdx() uint64 {
 	lastIdx := atomic.LoadUint64(&c.lastIndex)
-
-	// minPossibleIdx is the lowest log we could possibly have cached. We might
-	// not have one that low because we just started or because we are currently
-	// writing a batch over the top, but below this we know it won't be in cache
-	// so there is no need to lock at all.
 	minPossibleIdx := uint64(1)
 	if lastIdx > c.size {
 		minPossibleIdx = lastIdx - c.size
 	}
-	if index < minPossibleIdx {
+	return minPossibleIdx
+}
+
+// GetLog gets a log entry at a given index.
+func (c *LogCacheAsync) GetLog(index uint64, log *Log) error {
+	// Quick check to see if it's even possibly in the cache so we avoid locking
+	// at all in the case of scanning through old records.
+	if index < c.minPossibleIdx() {
 		return c.store.GetLog(index, log)
 	}
 
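The refactor above pulls the lower-bound computation out of GetLog so that DeleteRange (below) can reuse it. The key property is that a ring of `size` slots ending at `lastIndex` can never hold anything below `lastIndex - size`, so readers can skip the lock entirely for old indexes. A minimal standalone sketch of that fast-path bound; the `cache` struct here is a simplified stand-in for illustration, not the real LogCacheAsync:

package main

import (
	"fmt"
	"sync/atomic"
)

// cache is a toy stand-in holding only the fields the fast path needs.
type cache struct {
	lastIndex uint64 // updated atomically by writers
	size      uint64 // ring buffer capacity
}

// minPossibleIdx mirrors the extracted helper: with a ring of `size` slots
// ending at lastIndex, nothing below lastIndex-size can still be cached, so
// no lock is needed to rule those indexes out.
func (c *cache) minPossibleIdx() uint64 {
	lastIdx := atomic.LoadUint64(&c.lastIndex)
	min := uint64(1)
	if lastIdx > c.size {
		min = lastIdx - c.size
	}
	return min
}

func main() {
	c := &cache{size: 1024}
	atomic.StoreUint64(&c.lastIndex, 5000)
	// Indexes 1..3975 can never be in a 1024-slot ring ending at 5000, so a
	// reader can go straight to the backing store without locking.
	fmt.Println(c.minPossibleIdx()) // 3976
}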
@@ -170,15 +172,50 @@ func (c *LogCacheAsync) StoreLogs(logs []*Log) error {
 	return nil
 }
 
-// DeleteRange deletes a range of log entries. The range is inclusive. We only
-// support DeleteRange calls in Sync mode which makes reasoning about the cache
-// simple because the cache can't contain any un-flushed writes in sync mode.
+// DeleteRange deletes a range of log entries. The range is inclusive. We need
+// to support deletions in both sync and async mode since the leader needs to
+// be able to truncate logs while in async mode. In async mode, though, we
+// can't clear the cache as it may contain unflushed logs. It has always been
+// OK to call DeleteRange and StoreLogs concurrently, since Raft truncates in
+// a background goroutine, so we don't have to involve the flusher thread in
+// the delete. We do need to be mindful, though, that the underlying LogStore
+// probably uses a lock to serialise DeleteRange and StoreLogs, so our call
+// here could deadlock with the flusher thread if we delete while holding the
+// cache state lock.
 func (c *LogCacheAsync) DeleteRange(min uint64, max uint64) error {
 	c.state.Lock()
 
 	if c.state.completionCh != nil {
+		// ASYNC MODE!
+
+		// Release the state lock while we delete to reduce contention if this is
+		// slow.
 		c.state.Unlock()
-		return errors.New("call to sync DeleteRange when in async mode")
+		err := c.store.DeleteRange(min, max)
+		if err != nil {
+			return err
+		}
+
+		// First, check whether the truncation could have any impact on the
+		// cached records.
+		if max < c.minPossibleIdx() {
+			// The deleted range was entirely below the cached range, so we are done.
+			return nil
+		}
+
+		// It's possible (but not certain) that some of the deleted range could be
+		// in cache. Check if any logs in the deleted range are present and remove
+		// them if they are.
+		c.state.Lock()
+		for idx := min; idx <= max; idx++ {
+			l := c.state.cache[idx&c.sizeMask]
+			if l != nil && l.Index == idx {
+				// Delete it!
+				c.state.cache[idx&c.sizeMask] = nil
+			}
+		}
+		c.state.Unlock()
+		return nil
 	}
 	// Invalidate the cache
 	c.state.cache = make([]*Log, c.size)
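The deletion loop only clears a slot when the entry's Index actually matches, because slots in the ring alias: a newer entry may already occupy `idx & sizeMask` for an older idx. A runnable toy illustrating why the `l.Index == idx` guard matters; it assumes, as the masking implies, that the cache size is a power of two, and the names here are illustrative rather than the real struct fields:

package main

import "fmt"

type Log struct{ Index uint64 }

func main() {
	const size = 8 // must be a power of two so idx & sizeMask == idx % size
	const sizeMask = size - 1

	ring := make([]*Log, size)
	for i := uint64(20); i <= 27; i++ {
		ring[i&sizeMask] = &Log{Index: i}
	}

	// Delete range [10, 22]. Slots for 10..19 now alias the entries 20..27
	// that replaced them, which is why the l.Index == idx check is essential:
	// only nil out a slot if it still holds exactly the index being deleted.
	min, max := uint64(10), uint64(22)
	for idx := min; idx <= max; idx++ {
		if l := ring[idx&sizeMask]; l != nil && l.Index == idx {
			ring[idx&sizeMask] = nil
		}
	}

	for slot, l := range ring {
		fmt.Printf("slot %d: %v\n", slot, l)
	}
	// Only entries 20, 21 and 22 are removed; 23..27 survive untouched.
}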
@@ -289,12 +326,16 @@ func (c *LogCacheAsync) doFlush(logs []*Log, start time.Time) *LogWriteCompletion {
 	err := c.store.StoreLogs(logs)
 
 	lwc := LogWriteCompletion{
+		// Initialize this to the current persistentIndex in case we fail to
+		// write. We could Load it, but there's no need since we know it must be
+		// the one just before the first log we are writing now.
 		PersistentIndex: logs[0].Index - 1,
 		Error:           err,
 		Duration:        time.Since(start),
 	}
 	if err == nil {
 		lwc.PersistentIndex = logs[len(logs)-1].Index
+		atomic.StoreUint64(&c.persistentIndex, logs[len(logs)-1].Index)
 	}
 	return &lwc
 }
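The two added lines give doFlush "no progress on failure" semantics: the completion starts pessimistically at the index just before the batch, and persistentIndex is only published (atomically, so lock-free readers can observe it) once the whole batch is durable. A self-contained sketch of that behaviour with a simulated store failure; `flush` is a hypothetical stand-in for doFlush, not the real method:

package main

import (
	"errors"
	"fmt"
	"sync/atomic"
	"time"
)

type Log struct{ Index uint64 }

type LogWriteCompletion struct {
	PersistentIndex uint64
	Error           error
	Duration        time.Duration
}

// flush mimics the hunk above: start from the index just before the batch so
// a failed write reports no progress, and only publish the new
// persistentIndex atomically once the whole batch is durable.
func flush(persistentIndex *uint64, logs []*Log, fail bool) LogWriteCompletion {
	var err error
	if fail {
		err = errors.New("disk full") // simulated store failure
	}
	lwc := LogWriteCompletion{PersistentIndex: logs[0].Index - 1, Error: err}
	if err == nil {
		lwc.PersistentIndex = logs[len(logs)-1].Index
		atomic.StoreUint64(persistentIndex, lwc.PersistentIndex)
	}
	return lwc
}

func main() {
	var persistent uint64 = 9
	batch := []*Log{{Index: 10}, {Index: 11}, {Index: 12}}

	fmt.Println(flush(&persistent, batch, true).PersistentIndex)  // 9: nothing persisted
	fmt.Println(flush(&persistent, batch, false).PersistentIndex) // 12: whole batch persisted
	fmt.Println(atomic.LoadUint64(&persistent))                   // 12
}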
@@ -347,8 +388,10 @@ func (c *LogCacheAsync) StoreLogsAsync(logs []*Log) error {
 	// this lets us get notified about when it's free. Note that even though we
 	// unlock and it's _possible_ for another StoreLogsAsync call to be made,
 	doneCh := make(chan struct{})
-	c.state.triggerChan <- syncRequest{startTime: start, doneCh: doneCh}
+	// Unlock before we send, since we might block if the flusher is busy, but
+	// it won't be able to complete without the lock.
 	c.state.Unlock()
+	c.state.triggerChan <- syncRequest{startTime: start, doneCh: doneCh}
 	<-doneCh
 	c.state.Lock()
 	// Reload the indexes now sync is done so we can check if there is space
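The reordering above is the deadlock fix from the commit message: triggerChan is unbuffered and the flusher needs the state lock to make progress, so sending while still holding the lock can wedge both goroutines. A minimal repro-shaped sketch of the safe ordering, using generic names rather than the real types:

package main

import (
	"fmt"
	"sync"
)

func main() {
	var mu sync.Mutex
	trigger := make(chan struct{}) // unbuffered, like triggerChan
	done := make(chan struct{})

	// Flusher: needs mu to finish each request. If the producer still held mu
	// while blocked on the send or on <-done below, neither side could
	// proceed: that is the shape of the deadlock this commit fixes.
	go func() {
		<-trigger
		mu.Lock() // acquire the same lock the producer uses
		mu.Unlock()
		close(done)
	}()

	mu.Lock()
	// ... mutate shared state under the lock ...
	mu.Unlock()           // release first,
	trigger <- struct{}{} // then hand off to the flusher,
	<-done                // then wait for completion
	fmt.Println("flush completed without deadlock")
}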
