Commit d36276d
p2p/dnsdisc: fix hot-spin when all trees are empty (#22313)
In the random sync algorithm used by the DNS node iterator, we first pick a random tree and then perform one sync action on that tree. This happens in a loop until any node is found. If no trees contain any nodes, the iterator will enter a hot loop spinning at 100% CPU.

The fix is somewhat involved: the iterator now checks whether a meaningful sync action can be performed on any tree. If there is nothing to do, it waits for the next root record recheck time to arrive and then tries again.

Fixes #22306
1 parent 6ec1561 commit d36276d
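For context, here is a minimal consumer-side sketch (not part of this commit) of how the affected iterator is driven through the public API. The enrtree URL and the empty Config are placeholder assumptions, not values taken from the commit. Per the description above, before this fix it.Next() could spin at 100% CPU when every configured tree was still empty; with it, the call sleeps until the next scheduled root recheck.

package main

import (
    "fmt"

    "github.com/ethereum/go-ethereum/p2p/dnsdisc"
)

func main() {
    // Placeholder list URL; any enrtree:// URL signed by its publisher works here.
    const url = "enrtree://AKA3AM6LPBYEUDMVNU3BSVQJ5AD45Y7YPOHJLEF6W26QOE4VTUDPE@all.mainnet.ethdisco.net"

    client := dnsdisc.NewClient(dnsdisc.Config{}) // default resolver, logger and recheck interval
    it, err := client.NewIterator(url)
    if err != nil {
        panic(err)
    }
    defer it.Close()

    // Next blocks until a node is available. With an ENR tree that has no leaf
    // entries, this call used to busy-loop; after this commit it waits for the
    // next root record recheck instead.
    for it.Next() {
        fmt.Println("found node:", it.Node().ID())
    }
}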

File tree: 3 files changed, 138 additions, 19 deletions

p2p/dnsdisc/client.go (69 additions, 13 deletions)

@@ -217,8 +217,11 @@ type randomIterator struct {
 	c  *Client
 
 	mu    sync.Mutex
-	trees map[string]*clientTree // all trees
 	lc    linkCache              // tracks tree dependencies
+	trees map[string]*clientTree // all trees
+	// buffers for syncableTrees
+	syncableList []*clientTree
+	disabledList []*clientTree
 }
 
 func (c *Client) newRandomIterator() *randomIterator {
@@ -238,10 +241,10 @@ func (it *randomIterator) Node() *enode.Node {
 
 // Close closes the iterator.
 func (it *randomIterator) Close() {
+	it.cancelFn()
+
 	it.mu.Lock()
 	defer it.mu.Unlock()
-
-	it.cancelFn()
 	it.trees = nil
 }
 
@@ -264,7 +267,7 @@ func (it *randomIterator) addTree(url string) error {
 // nextNode syncs random tree entries until it finds a node.
 func (it *randomIterator) nextNode() *enode.Node {
 	for {
-		ct := it.nextTree()
+		ct := it.pickTree()
 		if ct == nil {
 			return nil
 		}
@@ -282,26 +285,79 @@ func (it *randomIterator) nextNode() *enode.Node {
 	}
 }
 
-// nextTree returns a random tree.
-func (it *randomIterator) nextTree() *clientTree {
+// pickTree returns a random tree to sync from.
+func (it *randomIterator) pickTree() *clientTree {
 	it.mu.Lock()
 	defer it.mu.Unlock()
 
+	// Rebuild the trees map if any links have changed.
 	if it.lc.changed {
 		it.rebuildTrees()
 		it.lc.changed = false
 	}
-	if len(it.trees) == 0 {
-		return nil
+
+	for {
+		canSync, trees := it.syncableTrees()
+		switch {
+		case canSync:
+			// Pick a random tree.
+			return trees[rand.Intn(len(trees))]
+		case len(trees) > 0:
+			// No sync action can be performed on any tree right now. The only meaningful
+			// thing to do is waiting for any root record to get updated.
+			if !it.waitForRootUpdates(trees) {
+				// Iterator was closed while waiting.
+				return nil
+			}
+		default:
+			// There are no trees left, the iterator was closed.
+			return nil
+		}
 	}
-	limit := rand.Intn(len(it.trees))
+}
+
+// syncableTrees finds trees on which any meaningful sync action can be performed.
+func (it *randomIterator) syncableTrees() (canSync bool, trees []*clientTree) {
+	// Resize tree lists.
+	it.syncableList = it.syncableList[:0]
+	it.disabledList = it.disabledList[:0]
+
+	// Partition them into the two lists.
 	for _, ct := range it.trees {
-		if limit == 0 {
-			return ct
+		if ct.canSyncRandom() {
+			it.syncableList = append(it.syncableList, ct)
+		} else {
+			it.disabledList = append(it.disabledList, ct)
 		}
-		limit--
 	}
-	return nil
+	if len(it.syncableList) > 0 {
+		return true, it.syncableList
+	}
+	return false, it.disabledList
+}
+
+// waitForRootUpdates waits for the closest scheduled root check time on the given trees.
+func (it *randomIterator) waitForRootUpdates(trees []*clientTree) bool {
+	var minTree *clientTree
+	var nextCheck mclock.AbsTime
+	for _, ct := range trees {
+		check := ct.nextScheduledRootCheck()
+		if minTree == nil || check < nextCheck {
+			minTree = ct
+			nextCheck = check
+		}
+	}
+
+	sleep := nextCheck.Sub(it.c.clock.Now())
+	it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep, "tree", minTree.loc.domain)
+	timeout := it.c.clock.NewTimer(sleep)
+	defer timeout.Stop()
+	select {
+	case <-timeout.C():
+		return true
+	case <-it.ctx.Done():
+		return false // Iterator was closed.
+	}
 }
 
 // rebuildTrees rebuilds the 'trees' map.

p2p/dnsdisc/client_test.go (47 additions, 0 deletions)

@@ -231,6 +231,53 @@ func TestIteratorRootRecheckOnFail(t *testing.T) {
 	checkIterator(t, it, nodes)
 }
 
+// This test checks that the iterator works correctly when the tree is initially empty.
+func TestIteratorEmptyTree(t *testing.T) {
+	var (
+		clock    = new(mclock.Simulated)
+		nodes    = testNodes(nodesSeed1, 1)
+		resolver = newMapResolver()
+		c        = NewClient(Config{
+			Resolver:        resolver,
+			Logger:          testlog.Logger(t, log.LvlTrace),
+			RecheckInterval: 20 * time.Minute,
+			RateLimit:       500,
+		})
+	)
+	c.clock = clock
+	tree1, url := makeTestTree("n", nil, nil)
+	tree2, _ := makeTestTree("n", nodes, nil)
+	resolver.add(tree1.ToTXT("n"))
+
+	// Start the iterator.
+	node := make(chan *enode.Node)
+	it, err := c.NewIterator(url)
+	if err != nil {
+		t.Fatal(err)
+	}
+	go func() {
+		it.Next()
+		node <- it.Node()
+	}()
+
+	// Wait for the client to get stuck in waitForRootUpdates.
+	clock.WaitForTimers(1)
+
+	// Now update the root.
+	resolver.add(tree2.ToTXT("n"))
+
+	// Wait for it to pick up the root change.
+	clock.Run(c.cfg.RecheckInterval)
+	select {
+	case n := <-node:
+		if n.ID() != nodes[0].ID() {
+			t.Fatalf("wrong node returned")
+		}
+	case <-time.After(5 * time.Second):
+		t.Fatal("it.Next() did not unblock within 5s of real time")
+	}
+}
+
 // updateSomeNodes applies ENR updates to some of the given nodes.
 func updateSomeNodes(keySeed int64, nodes []*enode.Node) {
 	keys := testKeys(nodesSeed1, len(nodes))
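The new test drives time with go-ethereum's simulated clock instead of real sleeps. As a rough standalone sketch (assuming only the common/mclock package; the 20 minute duration simply mirrors the test's RecheckInterval and is otherwise arbitrary), the WaitForTimers/Run pattern it relies on looks like this:

package main

import (
    "fmt"
    "time"

    "github.com/ethereum/go-ethereum/common/mclock"
)

func main() {
    clock := new(mclock.Simulated)

    // A goroutine sleeps on the simulated clock, much like the iterator
    // blocking in waitForRootUpdates until the next root recheck.
    done := make(chan struct{})
    go func() {
        timer := clock.NewTimer(20 * time.Minute)
        defer timer.Stop()
        <-timer.C()
        close(done)
    }()

    // Block until the goroutine has armed its timer, then advance simulated
    // time far enough to fire it. No real 20 minutes pass.
    clock.WaitForTimers(1)
    clock.Run(20 * time.Minute)

    <-done
    fmt.Println("timer fired at simulated time", clock.Now())
}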

p2p/dnsdisc/sync.go (22 additions, 6 deletions)

@@ -25,9 +25,9 @@ import (
 	"github.com/ethereum/go-ethereum/p2p/enode"
 )
 
-const (
-	rootRecheckFailCount = 5 // update root if this many leaf requests fail
-)
+// This is the number of consecutive leaf requests that may fail before
+// we consider re-resolving the tree root.
+const rootRecheckFailCount = 5
 
 // clientTree is a full tree being synced.
 type clientTree struct {
@@ -89,13 +89,22 @@ func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error)
 	ct.gcLinks()
 
 	// Sync next random entry in ENR tree. Once every node has been visited, we simply
-	// start over. This is fine because entries are cached.
+	// start over. This is fine because entries are cached internally by the client LRU
+	// also by DNS resolvers.
 	if ct.enrs.done() {
 		ct.enrs = newSubtreeSync(ct.c, ct.loc, ct.root.eroot, false)
 	}
 	return ct.syncNextRandomENR(ctx)
 }
 
+// canSyncRandom checks if any meaningful action can be performed by syncRandom.
+func (ct *clientTree) canSyncRandom() bool {
+	// Note: the check for non-zero leaf count is very important here.
+	// If we're done syncing all nodes, and no leaves were found, the tree
+	// is empty and we can't use it for sync.
+	return ct.rootUpdateDue() || !ct.links.done() || !ct.enrs.done() || ct.enrs.leaves != 0
+}
+
 // gcLinks removes outdated links from the global link cache. GC runs once
 // when the link sync finishes.
 func (ct *clientTree) gcLinks() {
@@ -184,10 +193,14 @@ func (ct *clientTree) updateRoot(ctx context.Context) error {
 // rootUpdateDue returns true when a root update is needed.
 func (ct *clientTree) rootUpdateDue() bool {
 	tooManyFailures := ct.leafFailCount > rootRecheckFailCount
-	scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+	scheduledCheck := ct.c.clock.Now() >= ct.nextScheduledRootCheck()
 	return ct.root == nil || tooManyFailures || scheduledCheck
 }
 
+func (ct *clientTree) nextScheduledRootCheck() mclock.AbsTime {
+	return ct.lastRootCheck.Add(ct.c.cfg.RecheckInterval)
+}
+
 // slowdownRootUpdate applies a delay to root resolution if is tried
 // too frequently. This avoids busy polling when the client is offline.
 // Returns true if the timeout passed, false if sync was canceled.
@@ -218,10 +231,11 @@ type subtreeSync struct {
 	root    string
 	missing []string // missing tree node hashes
 	link    bool     // true if this sync is for the link tree
+	leaves  int      // counter of synced leaves
 }
 
 func newSubtreeSync(c *Client, loc *linkEntry, root string, link bool) *subtreeSync {
-	return &subtreeSync{c, loc, root, []string{root}, link}
+	return &subtreeSync{c, loc, root, []string{root}, link, 0}
 }
 
 func (ts *subtreeSync) done() bool {
@@ -253,10 +267,12 @@ func (ts *subtreeSync) resolveNext(ctx context.Context, hash string) (entry, err
 		if ts.link {
 			return nil, errENRInLinkTree
 		}
+		ts.leaves++
 	case *linkEntry:
 		if !ts.link {
 			return nil, errLinkInENRTree
 		}
+		ts.leaves++
 	case *branchEntry:
 		ts.missing = append(ts.missing, e.children...)
 	}