Skip to content

Commit ef0dedc

Browse files
doujiang24cherrymui
authored andcommitted
runtime/cgo: store M for C-created thread in pthread key
In a C thread, it's necessary to acquire an extra M by using needm while invoking a Go function from C. But needm and dropm are costly due to the signal-related syscalls. So, we change to not dropm while returning back to C, which means binding the extra M to the C thread until it exits, to avoid needm and dropm on each C to Go call. Instead, we only dropm when the C thread exits, so the extra M won't leak. When invoking a Go function from C: Allocate a pthread variable using pthread_key_create, only once per shared object, and register a thread-exit-time destructor. And store the g0 of the current m into the thread-specific value of the pthread key, only once per C thread, so that the destructor will put the extra M back onto the extra M list when the C thread exits. When returning back to C: Skip dropm in cgocallback, when the pthread variable has been created, so that the extra M will be reused the next time a Go function is invoked from C. This is purely a performance optimization. The old version, in which needm & dropm happen on each cgo call, is still correct too, and we have to keep the old version on systems with cgo but without pthreads, like Windows. This optimization is significant, and the specific value depends on the OS and CPU, but in general, it can be considered as 10x faster, for a simple Go function call from a C thread. For the newly added BenchmarkCGoInCThread, some benchmark results: 1. it's 28x faster, from 3395 ns/op to 121 ns/op, in darwin OS & Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz 2. 
it's 6.5x faster, from 1495 ns/op to 230 ns/op, in Linux OS & Intel(R) Xeon(R) CPU E5-2630 0 @ 2.30GHz Fixes #51676 Change-Id: I380702fe2f9b6b401b2d6f04b0aba990f4b9ee6c GitHub-Last-Rev: 93dc64a GitHub-Pull-Request: #51679 Reviewed-on: https://go-review.googlesource.com/c/go/+/392854 Reviewed-by: Ian Lance Taylor <[email protected]> TryBot-Result: Gopher Robot <[email protected]> Run-TryBot: thepudds <[email protected]> Reviewed-by: Cherry Mui <[email protected]>
1 parent a6c382e commit ef0dedc

37 files changed

+760
-61
lines changed

misc/cgo/test/cgo_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ func TestThreadLock(t *testing.T) { testThreadLockFunc(t) }
104104
func TestUnsignedInt(t *testing.T) { testUnsignedInt(t) }
105105
func TestZeroArgCallback(t *testing.T) { testZeroArgCallback(t) }
106106

107-
func BenchmarkCgoCall(b *testing.B) { benchCgoCall(b) }
108-
func BenchmarkGoString(b *testing.B) { benchGoString(b) }
109-
func BenchmarkCGoCallback(b *testing.B) { benchCallback(b) }
107+
func BenchmarkCgoCall(b *testing.B) { benchCgoCall(b) }
108+
func BenchmarkGoString(b *testing.B) { benchGoString(b) }
109+
func BenchmarkCGoCallback(b *testing.B) { benchCallback(b) }
110+
func BenchmarkCGoInCThread(b *testing.B) { benchCGoInCthread(b) }

misc/cgo/test/cthread_unix.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,27 @@ doAdd(int max, int nthread)
3232
for(i=0; i<nthread; i++)
3333
pthread_join(thread_id[i], 0);
3434
}
35+
36+
static void*
37+
goDummyCallbackThread(void* p)
38+
{
39+
int i, max;
40+
41+
max = *(int*)p;
42+
for(i=0; i<max; i++)
43+
goDummy();
44+
return NULL;
45+
}
46+
47+
int
48+
callGoInCThread(int max)
49+
{
50+
pthread_t thread;
51+
52+
if (pthread_create(&thread, NULL, goDummyCallbackThread, (void*)(&max)) != 0)
53+
return -1;
54+
if (pthread_join(thread, NULL) != 0)
55+
return -1;
56+
57+
return max;
58+
}

misc/cgo/test/cthread_windows.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,25 @@ doAdd(int max, int nthread)
3535
CloseHandle((HANDLE)thread_id[i]);
3636
}
3737
}
38+
39+
__stdcall
40+
static unsigned int
41+
goDummyCallbackThread(void* p)
42+
{
43+
int i, max;
44+
45+
max = *(int*)p;
46+
for(i=0; i<max; i++)
47+
goDummy();
48+
return 0;
49+
}
50+
51+
int
52+
callGoInCThread(int max)
53+
{
54+
uintptr_t thread_id;
55+
thread_id = _beginthreadex(0, 0, goDummyCallbackThread, &max, 0, 0);
56+
WaitForSingleObject((HANDLE)thread_id, INFINITE);
57+
CloseHandle((HANDLE)thread_id);
58+
return max;
59+
}

misc/cgo/test/testx.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
/*
2525
// threads
2626
extern void doAdd(int, int);
27+
extern int callGoInCThread(int);
2728
2829
// issue 1328
2930
void IntoC(void);
@@ -146,6 +147,10 @@ func Add(x int) {
146147
*p = 2
147148
}
148149

150+
//export goDummy
151+
func goDummy() {
152+
}
153+
149154
func testCthread(t *testing.T) {
150155
if (runtime.GOOS == "darwin" || runtime.GOOS == "ios") && runtime.GOARCH == "arm64" {
151156
t.Skip("the iOS exec wrapper is unable to properly handle the panic from Add")
@@ -159,6 +164,15 @@ func testCthread(t *testing.T) {
159164
}
160165
}
161166

167+
// Benchmark measuring overhead from C to Go in a C thread.
168+
// Create a new C thread and invoke Go function repeatedly in the new C thread.
169+
func benchCGoInCthread(b *testing.B) {
170+
n := C.callGoInCThread(C.int(b.N))
171+
if int(n) != b.N {
172+
b.Fatal("unmatch loop times")
173+
}
174+
}
175+
162176
// issue 1328
163177

164178
//export BackIntoGo

src/runtime/asm_386.s

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -689,7 +689,20 @@ nosave:
689689
TEXT ·cgocallback(SB),NOSPLIT,$12-12 // Frame size must match commented places below
690690
NO_LOCAL_POINTERS
691691

692-
// If g is nil, Go did not create the current thread.
692+
// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
693+
// It is used to dropm while thread is exiting.
694+
MOVL fn+0(FP), AX
695+
CMPL AX, $0
696+
JNE loadg
697+
// Restore the g from frame.
698+
get_tls(CX)
699+
MOVL frame+4(FP), BX
700+
MOVL BX, g(CX)
701+
JMP dropm
702+
703+
loadg:
704+
// If g is nil, Go did not create the current thread,
705+
// or if this thread never called into Go on pthread platforms.
693706
// Call needm to obtain one for temporary use.
694707
// In this case, we're running on the thread stack, so there's
695708
// lots of space, but the linker doesn't know. Hide the call from
@@ -707,9 +720,9 @@ TEXT ·cgocallback(SB),NOSPLIT,$12-12 // Frame size must match commented places
707720
MOVL BP, savedm-4(SP) // saved copy of oldm
708721
JMP havem
709722
needm:
710-
MOVL $runtime·needm(SB), AX
723+
MOVL $runtime·needAndBindM(SB), AX
711724
CALL AX
712-
MOVL $0, savedm-4(SP) // dropm on return
725+
MOVL $0, savedm-4(SP)
713726
get_tls(CX)
714727
MOVL g(CX), BP
715728
MOVL g_m(BP), BP
@@ -784,13 +797,29 @@ havem:
784797
MOVL 0(SP), AX
785798
MOVL AX, (g_sched+gobuf_sp)(SI)
786799

787-
// If the m on entry was nil, we called needm above to borrow an m
788-
// for the duration of the call. Since the call is over, return it with dropm.
800+
// If the m on entry was nil, we called needm above to borrow an m,
801+
// 1. for the duration of the call on non-pthread platforms,
802+
// 2. or the duration of the C thread alive on pthread platforms.
803+
// If the m on entry wasn't nil,
804+
// 1. the thread might be a Go thread,
805+
// 2. or it wasn't the first call from a C thread on pthread platforms,
806+
// since we skip dropm to reuse the m in the first call.
789807
MOVL savedm-4(SP), DX
790808
CMPL DX, $0
791-
JNE 3(PC)
809+
JNE droppedm
810+
811+
// Skip dropm to reuse it in the next call, when a pthread key has been created.
812+
MOVL _cgo_pthread_key_created(SB), DX
813+
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
814+
CMPL DX, $0
815+
JEQ dropm
816+
CMPL (DX), $0
817+
JNE droppedm
818+
819+
dropm:
792820
MOVL $runtime·dropm(SB), AX
793821
CALL AX
822+
droppedm:
794823

795824
// Done!
796825
RET

src/runtime/asm_amd64.s

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -915,7 +915,20 @@ GLOBL zeroTLS<>(SB),RODATA,$const_tlsSize
915915
TEXT ·cgocallback(SB),NOSPLIT,$24-24
916916
NO_LOCAL_POINTERS
917917

918-
// If g is nil, Go did not create the current thread.
918+
// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
919+
// It is used to dropm while thread is exiting.
920+
MOVQ fn+0(FP), AX
921+
CMPQ AX, $0
922+
JNE loadg
923+
// Restore the g from frame.
924+
get_tls(CX)
925+
MOVQ frame+8(FP), BX
926+
MOVQ BX, g(CX)
927+
JMP dropm
928+
929+
loadg:
930+
// If g is nil, Go did not create the current thread,
931+
// or if this thread never called into Go on pthread platforms.
919932
// Call needm to obtain one m for temporary use.
920933
// In this case, we're running on the thread stack, so there's
921934
// lots of space, but the linker doesn't know. Hide the call from
@@ -953,9 +966,9 @@ needm:
953966
// a bad value in there, in case needm tries to use it.
954967
XORPS X15, X15
955968
XORQ R14, R14
956-
MOVQ $runtime·needm<ABIInternal>(SB), AX
969+
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
957970
CALL AX
958-
MOVQ $0, savedm-8(SP) // dropm on return
971+
MOVQ $0, savedm-8(SP)
959972
get_tls(CX)
960973
MOVQ g(CX), BX
961974
MOVQ g_m(BX), BX
@@ -1044,11 +1057,26 @@ havem:
10441057
MOVQ 0(SP), AX
10451058
MOVQ AX, (g_sched+gobuf_sp)(SI)
10461059

1047-
// If the m on entry was nil, we called needm above to borrow an m
1048-
// for the duration of the call. Since the call is over, return it with dropm.
1060+
// If the m on entry was nil, we called needm above to borrow an m,
1061+
// 1. for the duration of the call on non-pthread platforms,
1062+
// 2. or the duration of the C thread alive on pthread platforms.
1063+
// If the m on entry wasn't nil,
1064+
// 1. the thread might be a Go thread,
1065+
// 2. or it wasn't the first call from a C thread on pthread platforms,
1066+
// since we skip dropm to reuse the m in the first call.
10491067
MOVQ savedm-8(SP), BX
10501068
CMPQ BX, $0
10511069
JNE done
1070+
1071+
// Skip dropm to reuse it in the next call, when a pthread key has been created.
1072+
MOVQ _cgo_pthread_key_created(SB), AX
1073+
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
1074+
CMPQ AX, $0
1075+
JEQ dropm
1076+
CMPQ (AX), $0
1077+
JNE done
1078+
1079+
dropm:
10521080
MOVQ $runtime·dropm(SB), AX
10531081
CALL AX
10541082
#ifdef GOOS_windows

src/runtime/asm_arm.s

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,15 @@ nosave:
630630
TEXT ·cgocallback(SB),NOSPLIT,$12-12
631631
NO_LOCAL_POINTERS
632632

633+
// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
634+
// It is used to dropm while thread is exiting.
635+
MOVW fn+0(FP), R1
// This MOVW has no .S suffix and so does not update the condition flags;
// compare fn against nil explicitly so B.NE does not act on stale flags.
CMP $0, R1
B.NE loadg
637+
// Restore the g from frame.
638+
MOVW frame+4(FP), g
639+
B dropm
640+
641+
loadg:
633642
// Load m and g from thread-local storage.
634643
#ifdef GOOS_openbsd
635644
BL runtime·load_g(SB)
@@ -639,7 +648,8 @@ TEXT ·cgocallback(SB),NOSPLIT,$12-12
639648
BL.NE runtime·load_g(SB)
640649
#endif
641650

642-
// If g is nil, Go did not create the current thread.
651+
// If g is nil, Go did not create the current thread,
652+
// or if this thread never called into Go on pthread platforms.
643653
// Call needm to obtain one for temporary use.
644654
// In this case, we're running on the thread stack, so there's
645655
// lots of space, but the linker doesn't know. Hide the call from
@@ -653,7 +663,7 @@ TEXT ·cgocallback(SB),NOSPLIT,$12-12
653663

654664
needm:
655665
MOVW g, savedm-4(SP) // g is zero, so is m.
656-
MOVW $runtime·needm(SB), R0
666+
MOVW $runtime·needAndBindM(SB), R0
657667
BL (R0)
658668

659669
// Set m->g0->sched.sp = SP, so that if a panic happens
@@ -724,14 +734,31 @@ havem:
724734
MOVW savedsp-12(SP), R4 // must match frame size
725735
MOVW R4, (g_sched+gobuf_sp)(g)
726736

727-
// If the m on entry was nil, we called needm above to borrow an m
728-
// for the duration of the call. Since the call is over, return it with dropm.
737+
// If the m on entry was nil, we called needm above to borrow an m,
738+
// 1. for the duration of the call on non-pthread platforms,
739+
// 2. or the duration of the C thread alive on pthread platforms.
740+
// If the m on entry wasn't nil,
741+
// 1. the thread might be a Go thread,
742+
// 2. or it wasn't the first call from a C thread on pthread platforms,
743+
// since we skip dropm to reuse the m in the first call.
729744
MOVW savedm-4(SP), R6
730745
CMP $0, R6
731-
B.NE 3(PC)
746+
B.NE done
747+
748+
// Skip dropm to reuse it in the next call, when a pthread key has been created.
749+
MOVW _cgo_pthread_key_created(SB), R6
750+
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
751+
CMP $0, R6
752+
B.EQ dropm
753+
MOVW (R6), R6
754+
CMP $0, R6
755+
B.NE done
756+
757+
dropm:
732758
MOVW $runtime·dropm(SB), R0
733759
BL (R0)
734760

761+
done:
735762
// Done!
736763
RET
737764

src/runtime/asm_arm64.s

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,10 +1014,20 @@ nosave:
10141014
TEXT ·cgocallback(SB),NOSPLIT,$24-24
10151015
NO_LOCAL_POINTERS
10161016

1017+
// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
1018+
// It is used to dropm while thread is exiting.
1019+
MOVD fn+0(FP), R1
1020+
CBNZ R1, loadg
1021+
// Restore the g from frame.
1022+
MOVD frame+8(FP), g
1023+
B dropm
1024+
1025+
loadg:
10171026
// Load g from thread-local storage.
10181027
BL runtime·load_g(SB)
10191028

1020-
// If g is nil, Go did not create the current thread.
1029+
// If g is nil, Go did not create the current thread,
1030+
// or if this thread never called into Go on pthread platforms.
10211031
// Call needm to obtain one for temporary use.
10221032
// In this case, we're running on the thread stack, so there's
10231033
// lots of space, but the linker doesn't know. Hide the call from
@@ -1030,7 +1040,7 @@ TEXT ·cgocallback(SB),NOSPLIT,$24-24
10301040

10311041
needm:
10321042
MOVD g, savedm-8(SP) // g is zero, so is m.
1033-
MOVD $runtime·needm(SB), R0
1043+
MOVD $runtime·needAndBindM(SB), R0
10341044
BL (R0)
10351045

10361046
// Set m->g0->sched.sp = SP, so that if a panic happens
@@ -1111,10 +1121,24 @@ havem:
11111121
MOVD savedsp-16(SP), R4
11121122
MOVD R4, (g_sched+gobuf_sp)(g)
11131123

1114-
// If the m on entry was nil, we called needm above to borrow an m
1115-
// for the duration of the call. Since the call is over, return it with dropm.
1124+
// If the m on entry was nil, we called needm above to borrow an m,
1125+
// 1. for the duration of the call on non-pthread platforms,
1126+
// 2. or the duration of the C thread alive on pthread platforms.
1127+
// If the m on entry wasn't nil,
1128+
// 1. the thread might be a Go thread,
1129+
// 2. or it wasn't the first call from a C thread on pthread platforms,
1130+
// since we skip dropm to reuse the m in the first call.
11161131
MOVD savedm-8(SP), R6
11171132
CBNZ R6, droppedm
1133+
1134+
// Skip dropm to reuse it in the next call, when a pthread key has been created.
1135+
MOVD _cgo_pthread_key_created(SB), R6
1136+
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
1137+
CBZ R6, dropm
1138+
MOVD (R6), R6
1139+
CBNZ R6, droppedm
1140+
1141+
dropm:
11181142
MOVD $runtime·dropm(SB), R0
11191143
BL (R0)
11201144
droppedm:

0 commit comments

Comments
 (0)