
Commit b50fcc8

runtime: don't hold scheduler lock when calling timeSleepUntil
Otherwise, we can get into a deadlock: sysmon takes the scheduler lock
and calls timeSleepUntil, which takes each P's timer lock. Simultaneously,
some P calls runtimer (holding the P's own timer lock), which wakes up the
scavenger, calling goready, calling wakep, calling startm, getting the
scheduler lock. Now the sysmon thread is holding the scheduler lock and
trying to get a P's timer lock, while some other thread running on that P
is holding the P's timer lock and trying to get the scheduler lock.

So change sysmon to call timeSleepUntil without holding the scheduler
lock, and change timeSleepUntil to use allpLock, which is only held for
limited periods of time and should never compete with timer locks.

This hopefully

Fixes #35375

At least it should fix the linux-arm64-packet builder problems, which
occurred more reliably as that system has GOMAXPROCS == 96, giving a
lot more scope for this deadlock.

Change-Id: I7a7917daf7a4882e0b27ca416e4f6300cfaaa774
Reviewed-on: https://go-review.googlesource.com/c/go/+/205558
Run-TryBot: Ian Lance Taylor <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
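Editor's note: the cycle described above is a textbook lock-order inversion
(A->B in one thread, B->A in another). As an illustration only — a minimal
hypothetical program, not runtime code — the sketch below reproduces the same
shape, with schedLock and timersLock standing in for sched.lock and a P's
timersLock:

	// Minimal sketch of the inverted lock-ordering deadlock described
	// above. This program is expected to hang under the unlucky
	// interleaving, not to do useful work.
	package main

	import "sync"

	var (
		schedLock  sync.Mutex // stand-in for sched.lock
		timersLock sync.Mutex // stand-in for one P's timersLock
	)

	func main() {
		var wg sync.WaitGroup
		wg.Add(2)

		// "sysmon": holds the scheduler lock while calling a
		// timeSleepUntil-like function that needs the timer lock.
		go func() {
			defer wg.Done()
			schedLock.Lock()
			timersLock.Lock() // blocks forever if the other goroutine won the race
			timersLock.Unlock()
			schedLock.Unlock()
		}()

		// "runtimer -> goready -> wakep -> startm": holds the timer
		// lock, then needs the scheduler lock.
		go func() {
			defer wg.Done()
			timersLock.Lock()
			schedLock.Lock() // blocks forever if "sysmon" won the race
			schedLock.Unlock()
			timersLock.Unlock()
		}()

		wg.Wait() // under the bad interleaving, this never returns
	}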
1 parent cf3be9b commit b50fcc8

2 files changed: +14 −2


src/runtime/proc.go (+3 −2)

@@ -4452,10 +4452,10 @@ func sysmon() {
 		}
 		usleep(delay)
 		now := nanotime()
+		next := timeSleepUntil()
 		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
 			lock(&sched.lock)
 			if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
-				next := timeSleepUntil()
 				if next > now {
 					atomic.Store(&sched.sysmonwait, 1)
 					unlock(&sched.lock)
@@ -4474,6 +4474,7 @@ func sysmon() {
 					osRelax(false)
 				}
 				now = nanotime()
+				next = timeSleepUntil()
 				lock(&sched.lock)
 				atomic.Store(&sched.sysmonwait, 0)
 				noteclear(&sched.sysmonnote)
@@ -4505,7 +4506,7 @@ func sysmon() {
 				incidlelocked(1)
 			}
 		}
-		if timeSleepUntil() < now {
+		if next < now {
 			// There are timers that should have already run,
 			// perhaps because there is an unpreemptible P.
 			// Try to start an M to run them.
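Editor's note: the sysmon change follows a simple discipline — take the
snapshot that needs the timer locks before acquiring the scheduler lock, and
accept that the snapshot may be slightly stale (the diff refreshes next after
sysmon wakes from its sleep, and only uses it as a heuristic for startm).
A minimal sketch of that pattern, with hypothetical names (lockA, lockB,
when, nextDeadline) standing in for sched.lock, a timersLock, and
timeSleepUntil:

	// Sketch: snapshot a value guarded by lock B *before* taking lock A,
	// so A is never held while acquiring B.
	package main

	import (
		"fmt"
		"sync"
	)

	var (
		lockA sync.Mutex // plays the role of sched.lock
		lockB sync.Mutex // plays the role of a P's timersLock
		when  int64      // plays the role of the next timer deadline
	)

	// nextDeadline reads state guarded by lockB, the way timeSleepUntil
	// reads each P's timers under its timersLock.
	func nextDeadline() int64 {
		lockB.Lock()
		defer lockB.Unlock()
		return when
	}

	func main() {
		when = 42

		// Old shape: lockA held while nextDeadline acquires lockB
		// (A -> B), which can deadlock against code that goes B -> A.
		// New shape: snapshot under B first, then take A with the result.
		next := nextDeadline()
		lockA.Lock()
		fmt.Println("next deadline (possibly stale, but deadlock-free):", next)
		lockA.Unlock()
	}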

src/runtime/time.go (+11 −0)

@@ -1234,14 +1234,24 @@ func timejumpLocked() *g {
 	return tb.gp
 }
 
+// timeSleepUntil returns the time when the next timer should fire.
+// This is only called by sysmon.
 func timeSleepUntil() int64 {
 	if oldTimers {
 		return timeSleepUntilOld()
 	}
 
 	next := int64(maxWhen)
 
+	// Prevent allp slice changes. This is like retake.
+	lock(&allpLock)
 	for _, pp := range allp {
+		if pp == nil {
+			// This can happen if procresize has grown
+			// allp but not yet created new Ps.
+			continue
+		}
+
 		lock(&pp.timersLock)
 		c := atomic.Load(&pp.adjustTimers)
 		for _, t := range pp.timers {
@@ -1276,6 +1286,7 @@ func timeSleepUntil() int64 {
 		}
 		unlock(&pp.timersLock)
 	}
+	unlock(&allpLock)
 
 	return next
 }
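Editor's note: the time.go side establishes a single lock ordering — allpLock
before any timersLock — and allpLock is never acquired while a timer lock is
held, so no cycle can form; the nil check covers the window where procresize
has grown allp before creating the new Ps. A minimal sketch of the same
shape, using hypothetical names (allMu, proc, earliest), not runtime code:

	// Sketch: a short-lived lock guards the slice itself, per-element
	// locks guard element state, and nil slots are skipped.
	package main

	import (
		"fmt"
		"sync"
	)

	type proc struct {
		mu       sync.Mutex // plays the role of pp.timersLock
		deadline int64
	}

	var (
		allMu sync.Mutex // plays the role of allpLock
		all   []*proc    // plays the role of allp; may contain nil slots
	)

	// earliest scans every proc for the smallest deadline, the way
	// timeSleepUntil scans every P's timers.
	func earliest() int64 {
		next := int64(1 << 62) // sentinel, like maxWhen

		allMu.Lock() // keep the slice from changing underneath us
		for _, p := range all {
			if p == nil {
				// Grown but not yet populated; skip it.
				continue
			}
			p.mu.Lock() // ordering is always allMu -> p.mu, never reversed
			if p.deadline < next {
				next = p.deadline
			}
			p.mu.Unlock()
		}
		allMu.Unlock()

		return next
	}

	func main() {
		all = []*proc{{deadline: 100}, nil, {deadline: 50}}
		fmt.Println(earliest()) // prints 50
	}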
