
p2p/dnsdisc: fix hot-spin when all trees are empty #22313


Merged: 11 commits, Feb 19, 2021
65 changes: 54 additions & 11 deletions p2p/dnsdisc/client.go
@@ -216,9 +216,10 @@ type randomIterator struct {
cancelFn context.CancelFunc
c *Client

mu sync.Mutex
trees map[string]*clientTree // all trees
lc linkCache // tracks tree dependencies
mu sync.Mutex
trees map[string]*clientTree // all trees
rootWait map[string]*clientTree // trees waiting for root change
lc linkCache // tracks tree dependencies
}

func (c *Client) newRandomIterator() *randomIterator {
@@ -264,7 +265,7 @@ func (it *randomIterator) addTree(url string) error {
// nextNode syncs random tree entries until it finds a node.
func (it *randomIterator) nextNode() *enode.Node {
for {
ct := it.nextTree()
ct := it.pickTree()
if ct == nil {
return nil
}
@@ -282,8 +283,8 @@ func (it *randomIterator) nextNode() *enode.Node {
}
}

// nextTree returns a random tree.
func (it *randomIterator) nextTree() *clientTree {
// pickTree returns a random tree to sync from.
func (it *randomIterator) pickTree() *clientTree {
it.mu.Lock()
defer it.mu.Unlock()

@@ -294,14 +295,56 @@ func (it *randomIterator) nextTree() *clientTree {
if len(it.trees) == 0 {
return nil
}

for {
// Find trees that might still have pending items to sync.
// If there are any, pick a random syncable tree.
syncable, disabled := it.syncableTrees()
if len(syncable) > 0 {
return syncable[rand.Intn(len(syncable))]
}
// The client tried all trees, and no sync action can be performed on any of them.
// The only meaningful thing to do now is to wait for any root record to get
// updated.
if !it.waitForRootUpdates(disabled) {
return nil // Iterator was closed.
}
}
}

// syncableTrees finds trees on which any meaningful sync action can be performed.
func (it *randomIterator) syncableTrees() (syncable, disabled []*clientTree) {
Review comment from the PR author: "I still want to optimize this to reuse the same slices over and over, but let's review the basic idea first." (A sketch of that slice-reuse idea follows after this function.)

for _, ct := range it.trees {
if ct.canSyncRandom() {
syncable = append(syncable, ct)
} else {
disabled = append(disabled, ct)
}
}
return syncable, disabled
}
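The author's comment above mentions reusing the same slices across calls. A minimal sketch of that idea, assuming two extra scratch fields (syncableBuf, disabledBuf) on randomIterator that are not part of this diff, could look like this:

    // Hypothetical variant of syncableTrees that reuses scratch slices held on
    // the iterator instead of allocating new ones on every call.
    // syncableBuf and disabledBuf are assumed fields, not part of this PR.
    func (it *randomIterator) syncableTrees() (syncable, disabled []*clientTree) {
        it.syncableBuf = it.syncableBuf[:0]
        it.disabledBuf = it.disabledBuf[:0]
        for _, ct := range it.trees {
            if ct.canSyncRandom() {
                it.syncableBuf = append(it.syncableBuf, ct)
            } else {
                it.disabledBuf = append(it.disabledBuf, ct)
            }
        }
        return it.syncableBuf, it.disabledBuf
    }

Callers would then have to consume the returned slices before the next call, since both are overwritten on every invocation.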

// waitForRootUpdates waits for the closest scheduled root check time on the given trees.
func (it *randomIterator) waitForRootUpdates(trees []*clientTree) bool {
var nextCheck mclock.AbsTime
now := it.c.clock.Now()
for _, ct := range trees {
check := ct.nextScheduledRootCheck()
if nextCheck == 0 || check < nextCheck {
nextCheck = check
}
}

sleep := nextCheck.Sub(now)
it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep)
timeout := it.c.clock.NewTimer(sleep)
defer timeout.Stop()
select {
case <-timeout.C():
return true
case <-it.ctx.Done():
return false // Iterator was closed.
}
}

// rebuildTrees rebuilds the 'trees' map.
51 changes: 51 additions & 0 deletions p2p/dnsdisc/client_test.go
@@ -22,6 +22,7 @@ import (
"errors"
"math/rand"
"reflect"
"runtime"
"testing"
"time"

@@ -231,6 +232,56 @@ func TestIteratorRootRecheckOnFail(t *testing.T) {
checkIterator(t, it, nodes)
}

// This test checks that the iterator works correctly when the tree is initially empty.
func TestIteratorEmptyTree(t *testing.T) {
var (
clock = new(mclock.Simulated)
nodes = testNodes(nodesSeed1, 1)
resolver = newMapResolver()
c = NewClient(Config{
Resolver: resolver,
Logger: testlog.Logger(t, log.LvlTrace),
RecheckInterval: 20 * time.Minute,
RateLimit: 500,
})
)
c.clock = clock
tree1, url := makeTestTree("n", nil, nil)
tree2, url := makeTestTree("n", nodes, nil)
resolver.add(tree1.ToTXT("n"))

// Start the iterator.
node := make(chan *enode.Node)
it, err := c.NewIterator(url)
if err != nil {
t.Fatal(err)
}
go func() {
it.Next()
node <- it.Node()
}()

// Wait for the client to get stuck in waitForRootUpdates, then update the root.
clock.WaitForTimers(1)
resolver.add(tree2.ToTXT("n"))

timeout := time.After(5 * time.Second)
for {
clock.Run(1 * time.Second)
select {
case n := <-node:
if n.ID() != nodes[0].ID() {
t.Fatalf("wrong node returned")
}
return
case <-timeout:
t.Fatal("it.Next() did not unblock within 5s of real time")
default:
runtime.Gosched()
}
}
}
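Assuming a normal go-ethereum working copy, the new test can be run on its own with the standard Go tooling, for example:

    go test ./p2p/dnsdisc -run TestIteratorEmptyTree -v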

// updateSomeNodes applies ENR updates to some of the given nodes.
func updateSomeNodes(keySeed int64, nodes []*enode.Node) {
keys := testKeys(nodesSeed1, len(nodes))
28 changes: 22 additions & 6 deletions p2p/dnsdisc/sync.go
@@ -25,9 +25,9 @@ import (
"github.com/ethereum/go-ethereum/p2p/enode"
)

const (
rootRecheckFailCount = 5 // update root if this many leaf requests fail
)
// This is the number of consecutive leaf requests that may fail before
// we consider re-resolving the tree root.
const rootRecheckFailCount = 5

// clientTree is a full tree being synced.
type clientTree struct {
@@ -89,13 +89,22 @@ func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error)
ct.gcLinks()

// Sync next random entry in ENR tree. Once every node has been visited, we simply
// start over. This is fine because entries are cached.
// start over. This is fine because entries are cached internally by the client LRU
// and also by DNS resolvers.
if ct.enrs.done() {
ct.enrs = newSubtreeSync(ct.c, ct.loc, ct.root.eroot, false)
}
return ct.syncNextRandomENR(ctx)
}

// canSyncRandom checks if any meaningful action can be performed by syncRandom.
func (ct *clientTree) canSyncRandom() bool {
// Note: the check for non-zero leaf count is very important here.
// If we're done syncing all nodes, and no leaves were found, the tree
// is empty and we can't use it for sync.
return ct.rootUpdateDue() || !ct.links.done() || !ct.enrs.done() || ct.enrs.leaves != 0
}

// gcLinks removes outdated links from the global link cache. GC runs once
// when the link sync finishes.
func (ct *clientTree) gcLinks() {
@@ -184,10 +193,14 @@ func (ct *clientTree) updateRoot(ctx context.Context) error {
// rootUpdateDue returns true when a root update is needed.
func (ct *clientTree) rootUpdateDue() bool {
tooManyFailures := ct.leafFailCount > rootRecheckFailCount
scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
scheduledCheck := ct.c.clock.Now() >= ct.nextScheduledRootCheck()
return ct.root == nil || tooManyFailures || scheduledCheck
}

func (ct *clientTree) nextScheduledRootCheck() mclock.AbsTime {
return ct.lastRootCheck.Add(ct.c.cfg.RecheckInterval)
}

// slowdownRootUpdate applies a delay to root resolution if it is tried
// too frequently. This avoids busy polling when the client is offline.
// Returns true if the timeout passed, false if sync was canceled.
@@ -218,10 +231,11 @@ type subtreeSync struct {
root string
missing []string // missing tree node hashes
link bool // true if this sync is for the link tree
leaves int // counter of synced leaves
}

func newSubtreeSync(c *Client, loc *linkEntry, root string, link bool) *subtreeSync {
return &subtreeSync{c, loc, root, []string{root}, link}
return &subtreeSync{c, loc, root, []string{root}, link, 0}
}

func (ts *subtreeSync) done() bool {
@@ -253,10 +267,12 @@ func (ts *subtreeSync) resolveNext(ctx context.Context, hash string) (entry, err
if ts.link {
return nil, errENRInLinkTree
}
ts.leaves++
case *linkEntry:
if !ts.link {
return nil, errLinkInENRTree
}
ts.leaves++
case *branchEntry:
ts.missing = append(ts.missing, e.children...)
}