Skip to content

Commit 2f021d5

Browse files
committed
darwin/arm64: use the number of perf cores as the default number of workers
Starting around macOS v15 the parallel performance of the APFS file system appears slightly improved and now the ideal number of fastwalk workers is generally the number of performance cores currently available. Therefore, this commit changes the default number of workers on darwin/arm64 to be the value of sysctl("hw.perflevel0.physicalcpu") which is the number of currently available performance cores (link to xnu source below). https://github.com/apple-oss-distributions/xnu/blob/43a90889846e00bfb5cf1d255cdc0a701a1e05a4/bsd/sys/sysctl.h#L1244
1 parent f7a2749 commit 2f021d5

File tree

5 files changed

+137
-13
lines changed

5 files changed

+137
-13
lines changed

fastwalk.go

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,23 @@ func DefaultNumWorkers() int {
7777
if numCPU < 4 {
7878
return 4
7979
}
80+
// User manually set GOMAXPROCS - respect it.
81+
if numCPU != runtime.NumCPU() {
82+
return min(numCPU, 32)
83+
}
8084
// Darwin IO performance on APFS can slow with increased parallelism.
81-
// Depending on CPU, stat(2) performance is best around 4-10 workers
82-
// and file IO is best around 4 workers. More workers only benefit CPU
83-
// intensive tasks.
84-
//
85-
// NB(Charlie): As of macOS 15, the parallel performance of readdir_r(3)
86-
// and stat(2) calls has improved and is now generally the number of
87-
// performance cores (on ARM Macs).
85+
// For Intel CPUs (and maybe older arm64 CPUs) stat(2) performance is best
86+
// around 4-10 workers and file IO is best around 4 workers. More workers
87+
// only benefit CPU intensive tasks.
8888
//
89-
// TODO: Consider using the value of sysctl("hw.perflevel0.physicalcpu").
90-
// TODO: Find someone with a Mac Studio to test higher core counts.
89+
// As of macOS 15 (on ARM Macs), the parallel performance of readdir_r(3)
90+
// and stat(2) calls has improved and the ideal number of workers is now
91+
// generally the number of performance cores.
9192
if runtime.GOOS == "darwin" {
93+
if n := darwinNumPerfCores(); n > 0 {
94+
return n
95+
}
96+
// This is primarily for Intel CPUs.
9297
switch {
9398
case numCPU <= 8:
9499
return 4
@@ -98,10 +103,7 @@ func DefaultNumWorkers() int {
98103
return 10
99104
}
100105
}
101-
if numCPU > 32 {
102-
return 32
103-
}
104-
return numCPU
106+
return min(numCPU, 32)
105107
}
106108

107109
// DefaultToSlash returns true if this is a Go program compiled for Windows

fastwalk_test.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@ import (
1010
"io/fs"
1111
"math"
1212
"os"
13+
"os/exec"
1314
"os/user"
1415
"path/filepath"
1516
"reflect"
1617
"regexp"
1718
"runtime"
19+
"slices"
1820
"sort"
1921
"strconv"
2022
"strings"
@@ -1343,6 +1345,83 @@ func TestSkipAll(t *testing.T) {
13431345
}
13441346
}
13451347

1348+
func TestDefaultNumWorkers(t *testing.T) {
1349+
if os.Getenv("FASTWALK_TEST_COOKIE") == "123456" {
1350+
fmt.Printf("## DEFAULT_NUM_WORKERS: %d\n", fastwalk.DefaultNumWorkers())
1351+
return
1352+
}
1353+
1354+
exe, err := os.Executable()
1355+
if err != nil {
1356+
t.Fatal(err)
1357+
}
1358+
1359+
re := regexp.MustCompile(`## DEFAULT_NUM_WORKERS:\s+(\d+)`)
1360+
parseOut := func(t *testing.T, out []byte) int {
1361+
a := re.FindSubmatch(out)
1362+
if len(a) != 2 {
1363+
t.Fatal(`Sub-test output does not contain: "## DEFAULT_NUM_WORKERS"`)
1364+
}
1365+
n, err := strconv.Atoi(string(a[1]))
1366+
if err != nil {
1367+
t.Fatal(err)
1368+
}
1369+
return n
1370+
}
1371+
1372+
runTest := func(t *testing.T, maxProcs, want int) {
1373+
cmd := exec.Command(exe, "-test.run", "^TestDefaultNumWorkers$")
1374+
cmd.Env = cmd.Environ()
1375+
cmd.Env = slices.DeleteFunc(cmd.Env, func(s string) bool {
1376+
return strings.HasPrefix(s, "GOMAXPROCS=") || strings.HasPrefix(s, "GOFLAGS=")
1377+
})
1378+
cmd.Env = append(cmd.Env, "FASTWALK_TEST_COOKIE=123456")
1379+
if maxProcs > 0 {
1380+
cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%d", maxProcs))
1381+
}
1382+
out, err := cmd.CombinedOutput()
1383+
if err != nil {
1384+
t.Fatalf("test failed: %s", bytes.TrimSpace(out))
1385+
}
1386+
got := parseOut(t, out)
1387+
if got != want {
1388+
t.Fatalf("got: %d want: %d", got, want)
1389+
}
1390+
}
1391+
1392+
t.Run("GOMAXPROCS", func(t *testing.T) {
1393+
for _, procs := range []int{2, 4, 8, 20, 64} {
1394+
t.Run(strconv.Itoa(procs), func(t *testing.T) {
1395+
want := max(4, min(procs, 32))
1396+
runTest(t, procs, want)
1397+
})
1398+
}
1399+
})
1400+
1401+
t.Run("Darwin_ARM64", func(t *testing.T) {
1402+
if runtime.GOOS != "darwin" && runtime.GOARCH != "arm64" {
1403+
t.Skip("test only supported on darwin/arm64")
1404+
}
1405+
// Not all platforms have syscall.SysctlUint32 so shell out to
1406+
// "sysctl". This is ugly but saves us from having to duplicate
1407+
// a large portion of this code.
1408+
out, err := exec.Command("sysctl", "hw.perflevel0.physicalcpu").CombinedOutput()
1409+
if err != nil {
1410+
t.Fatal(err)
1411+
}
1412+
_, val, ok := strings.Cut(strings.TrimSpace(string(out)), " ")
1413+
if !ok {
1414+
t.Fatalf("Invalid sysctl output: %q", out)
1415+
}
1416+
want, err := strconv.Atoi(val)
1417+
if err != nil {
1418+
t.Fatal(err)
1419+
}
1420+
want = max(want, 4) // 4 is the minimum
1421+
runTest(t, -1, int(want))
1422+
})
1423+
}
1424+
13461425
func BenchmarkSortModeString(b *testing.B) {
13471426
var s string
13481427
for i := 0; i < b.N; i++ {

numcpu_darwin_arm64.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
//go:build darwin && arm64
2+
3+
package fastwalk
4+
5+
import "syscall"
6+
7+
// darwinNumPerfCores returns the number of performance cores currently
8+
// available on macOS/arm64.
9+
func darwinNumPerfCores() int {
10+
// "hw.physicalcpu" is the number of physical processors available in
11+
// the current power management mode.
12+
//
13+
// https://github.com/apple-oss-distributions/xnu/blob/43a90889846e00bfb5cf1d255cdc0a701a1e05a4/bsd/sys/sysctl.h#L1244
14+
//
15+
// NB: We do not cache this value since it could change with the power
16+
// management mode.
17+
//
18+
// TODO: Find someone with a Mac Studio or MacPro to see if this still
19+
// holds true when the CPU is basically two "fused" ones.
20+
n, err := syscall.SysctlUint32("hw.perflevel0.physicalcpu")
21+
if err != nil {
22+
return -1
23+
}
24+
return int(n)
25+
}

numcpu_darwin_arm64_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package fastwalk
2+
3+
import "testing"
4+
5+
func TestDarwinNumPerfCores(t *testing.T) {
6+
n := darwinNumPerfCores()
7+
// Test that n is reasonable
8+
if !(0 < n && n < 4096) {
9+
t.Fatalf("expected a value between 0..4096 got: %d", n)
10+
}
11+
}

numcpu_portable.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
//go:build !(darwin && arm64)
2+
3+
package fastwalk
4+
5+
// darwinNumPerfCores is the stub used on every platform other than
// darwin/arm64. It always returns -1.
func darwinNumPerfCores() int {
	const unavailable = -1
	return unavailable
}

0 commit comments

Comments
 (0)