esp32: implement task based scheduler

aykevl · deadprogram · commit caf35cfc4137 · 2020-12-05T09:02:11.000+01:00
This has been a *lot* of work, trying to understand the Xtensa windowed registers ABI. But in the end I managed to come up with a very simple implementation that so far seems to work very well. I tested this with both blinky examples (with blinky2 slightly edited) and ./testdata/coroutines.go to verify that it actually works. Most development happened on the ESP32 QEMU fork from Espressif (https://github.com/espressif/qemu/wiki) but I also verified that it works on a real ESP32.
diff --git a/src/device/esp/esp32.S b/src/device/esp/esp32.S
@@ -24,7 +24,7 @@ call_start_cpu0:
     wsr.ps a2
     rsync
 
-    // Set WINDOWBASE to 1 << WINDOWSTART.
+    // Set WINDOWSTART to 1 << WINDOWBASE.
     rsr.windowbase  a2
     ssl  a2
     movi a2, 1
@@ -43,7 +43,7 @@ call_start_cpu0:
     rsync
 
     // Jump to the runtime start function written in Go.
-    j main
+    call4 main
 
 .section .text.tinygo_scanCurrentStack
 .global tinygo_scanCurrentStack
diff --git a/src/internal/task/task_stack_esp32.S b/src/internal/task/task_stack_esp32.S
@@ -0,0 +1,86 @@
+.section .text.tinygo_startTask,"ax",@progbits
+.global  tinygo_startTask
+.type    tinygo_startTask, %function
+tinygo_startTask:
+    // Small assembly stub for starting a goroutine. This already runs on the
+    // new stack: control reaches this function after returning from the initial
+    // tinygo_swapTask below (the retw.n instruction).
+    //
+    // The stack was set up in such a way that it looks as if this function was
+    // paused using tinygo_swapTask by setting up the parent register window and
+    // return address as for a call4 instruction - except such a call never took
+    // place. Instead, the stack pointer is switched to the new stack after all
+    // live-but-invisible registers have been flushed to the stack. This means
+    // that all registers as present in tinygo_swapTask are moved four up (a2 in
+    // tinygo_swapTask is a6 in this function). We don't use any of those
+    // registers however. Instead, the retw.n instruction will load them through
+    // an underflow exception from the stack which means we get a0-a3 as defined
+    // in task_stack_esp32.go (the calleeSavedRegs struct filled in by archInit).
+
+    // Branch to the "goroutine start" function. The first (and only) parameter
+    // is stored in a2, but has to be moved to a6 to make it appear as a2 in the
+    // goroutine start function (due to changing the register window by four
+    // with callx4).
+    mov.n a6, a2 // a2 = args as stored by archInit
+    callx4 a3 // a3 = fn as stored by archInit
+
+    // After return, exit this goroutine. This call never returns.
+    call4  tinygo_pause
+
+.section .text.tinygo_swapTask,"ax",@progbits
+.global tinygo_swapTask
+.type tinygo_swapTask, %function
+tinygo_swapTask:
+    // This function gets the following parameters:
+    // a2 = newStack uintptr
+    // a3 = oldStack *uintptr
+
+    // Reserve 32 bytes on the stack. It really needs to be 32 bytes, with 16
+    // extra at the bottom to adhere to the ABI.
+    entry sp, 32
+
+    // Disable interrupts while flushing registers. This is necessary because
+    // interrupts might want to use the stack pointer (at a1) which will be some
+    // arbitrary register while registers are flushed.
+    rsil a4, 3 // XCHAL_EXCM_LEVEL; a4 keeps the old state for wsr.ps below
+
+    // Flush all unsaved registers to the stack.
+    // This trick has been borrowed from the Zephyr project:
+    // https://github.com/zephyrproject-rtos/zephyr/blob/d79b003758/arch/xtensa/include/xtensa-asm2-s.h#L17
+    and a12, a12, a12
+    rotw 3
+    and a12, a12, a12
+    rotw 3
+    and a12, a12, a12
+    rotw 3
+    and a12, a12, a12
+    rotw 3
+    and a12, a12, a12
+    rotw 4 // NOTE(review): rotations sum to 16, presumably one full turn - confirm
+
+    // Restore interrupts.
+    wsr.ps a4
+
+    // At this point, the following is true:
+    //     WindowStart == 1 << WindowBase
+    // Therefore, we don't need to do this manually.
+    // It also means that the stack pointer can now be safely modified.
+
+    // Save a0, which stores the return address and the parent register window
+    // in the upper two bits.
+    s32i.n a0, sp, 0
+
+    // Save the current stack pointer in oldStack.
+    s32i.n  sp, a3, 0
+
+    // Switch to the new stack pointer (newStack).
+    mov.n   sp, a2
+
+    // Load a0, which is the previous return address from before the previous
+    // switch or the constructed return address to tinygo_startTask. This
+    // register also stores the parent register window.
+    l32i.n a0, sp, 0
+
+    // Return into the new stack. This instruction will trigger a window
+    // underflow, reloading the saved registers from the stack.
+    retw.n
diff --git a/src/internal/task/task_stack_esp32.go b/src/internal/task/task_stack_esp32.go
@@ -0,0 +1,76 @@
+// +build scheduler.tasks,esp32
+
+package task
+
+// The windowed ABI (used on the ESP32) is as follows:
+//   a0:    return address (link register)
+//   a1:    stack pointer (must be 16-byte aligned)
+//   a2-a7: incoming arguments
+//   a7:    stack frame pointer (optional, normally unused in TinyGo)
+// Sources:
+//   http://cholla.mmto.org/esp8266/xtensa.html
+//   https://0x04.net/~mwk/doc/xtensa.pdf
+
+import (
+	"unsafe"
+)
+
+var systemStack uintptr
+
+// calleeSavedRegs is the list of registers that must be saved and restored when
+// switching between tasks. Also see task_stack_esp32.S that relies on the
+// exact layout of this struct.
+type calleeSavedRegs struct {
+	// Registers in the register window of tinygo_startTask.
+	a0 uintptr // not written by archInit
+	a1 uintptr // stack pointer inside tinygo_startTask
+	a2 uintptr // the (only) parameter passed to the goroutine
+	a3 uintptr // the goroutine start function
+
+	// Locals that can be used by tinygo_swapTask.
+	// The first field is the a0 loaded in tinygo_swapTask, the rest is unused.
+	locals [4]uintptr
+}
+
+// archInit fills in r so that the first switch to this task calls fn with args.
+func (s *state) archInit(r *calleeSavedRegs, fn uintptr, args unsafe.Pointer) {
+	// Store the stack pointer for the tinygo_swapTask function (implemented in
+	// assembly). It needs to point to the locals field instead of a0 so that
+	// the retw.n at the end of tinygo_swapTask will return into
+	// tinygo_startTask with a0-a3 loaded (using the register window mechanism).
+	s.sp = uintptr(unsafe.Pointer(&r.locals[0]))
+
+	// Start the goroutine at tinygo_startTask (defined in
+	// src/internal/task/task_stack_esp32.S). The topmost two bits are not part
+	// of the address but instead store the register window of the caller.
+	// In this case there is no caller, instead we set up the return address as
+	// if tinygo_startTask called tinygo_swapTask with a call4 instruction.
+	r.locals[0] = uintptr(unsafe.Pointer(&startTask))&^(3<<30) | (1 << 30)
+
+	// Set up the stack pointer inside tinygo_startTask.
+	// Unlike most calling conventions, the windowed ABI actually saves the
+	// stack pointer on the stack to make register windowing work.
+	r.a1 = uintptr(unsafe.Pointer(r)) + 32 // 32 = the 8 words of *r, assuming 4-byte uintptr
+
+	// Store the function pointer and the (only) parameter on the stack in a
+	// location that will be reloaded into registers when doing the
+	// pseudo-return to tinygo_startTask using the register window mechanism.
+	r.a3 = fn
+	r.a2 = uintptr(args)
+}
+
+func (s *state) resume() {
+	swapTask(s.sp, &systemStack) // switch to the task's stack; the current sp is saved in systemStack
+}
+
+func (s *state) pause() {
+	newStack := systemStack // the stack pointer saved by resume
+	systemStack = 0         // cleared before switching: SystemStack reports 0 on the system stack
+	swapTask(newStack, &s.sp)
+}
+
+// SystemStack returns the system stack pointer when called from a task stack.
+// When called from the system stack, it returns 0.
+func SystemStack() uintptr {
+	return systemStack // stored by resume (via swapTask), reset to 0 by pause
+}
diff --git a/src/runtime/arch_xtensa.go b/src/runtime/arch_xtensa.go
@@ -2,6 +2,8 @@
 
 package runtime
 
+import "device"
+
 const GOARCH = "arm" // xtensa pretends to be arm
 
 // The bitness of the CPU (e.g. 8, 32, 64).
@@ -12,4 +14,7 @@ func align(ptr uintptr) uintptr {
 	return (ptr + 3) &^ 3
 }
 
-func getCurrentStackPointer() uintptr
+func getCurrentStackPointer() uintptr {
+	// Read the stack pointer register: "sp" is the assembler name for a1.
+	return device.AsmFull("mov {}, sp", nil)
+}
diff --git a/targets/esp32.json b/targets/esp32.json
@@ -2,13 +2,16 @@
 	"inherits": ["xtensa"],
 	"cpu": "esp32",
 	"build-tags": ["esp32", "esp"],
+	"scheduler": "tasks",
 	"linker": "xtensa-esp32-elf-ld",
+	"default-stack-size": 2048,
 	"cflags": [
 		"-mcpu=esp32"
 	],
 	"linkerscript": "targets/esp32.ld",
 	"extra-files": [
-		"src/device/esp/esp32.S"
+		"src/device/esp/esp32.S",
+		"src/internal/task/task_stack_esp32.S"
 	],
 	"binary-format": "esp32",
 	"flash-command": "esptool.py --chip=esp32 --port {port} write_flash 0x1000 {bin} -ff 80m -fm dout"