Skip to content

Commit d60166d

Browse files
ceseolaboger
authored and committed
runtime: improve IndexByte for ppc64x
This change adds a better implementation of IndexByte for ppc64x.

Improvement for bytes·IndexByte:

benchmark                    old ns/op    new ns/op    delta
BenchmarkIndexByte/10-16     12.5         8.48         -32.16%
BenchmarkIndexByte/32-16     34.4         9.85         -71.37%
BenchmarkIndexByte/4K-16     3089         217          -92.98%
BenchmarkIndexByte/4M-16     3154810      207051       -93.44%
BenchmarkIndexByte/64M-16    50564811     5579093      -88.97%

benchmark                    old MB/s     new MB/s     speedup
BenchmarkIndexByte/10-16     800.41       1179.64      1.47x
BenchmarkIndexByte/32-16     930.60       3249.10      3.49x
BenchmarkIndexByte/4K-16     1325.71      18832.53     14.21x
BenchmarkIndexByte/4M-16     1329.49      20257.29     15.24x
BenchmarkIndexByte/64M-16    1327.19      12028.63     9.06x

Improvement for strings·IndexByte:

benchmark                    old ns/op    new ns/op    delta
BenchmarkIndexByte-16        25.9         7.69         -70.31%

Fixes #19030

Change-Id: Ifb82bbb3d643ec44b98eaa2d08a07f47e5c2fd11
Reviewed-on: https://go-review.googlesource.com/37670
Run-TryBot: Lynn Boger <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Lynn Boger <[email protected]>
1 parent d5dc490 commit d60166d

File tree

1 file changed

+165
-35
lines changed

1 file changed

+165
-35
lines changed

src/runtime/asm_ppc64x.s

Lines changed: 165 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,53 +1113,183 @@ equal:
11131113
MOVBZ R3,ret+48(FP)
11141114
RET
11151115

1116-
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
1117-
MOVD s+0(FP), R3
1118-
MOVD s_len+8(FP), R4
1119-
MOVBZ c+24(FP), R5 // byte to find
1120-
MOVD R3, R6 // store base for later
1121-
SUB $1, R3
1122-
ADD R3, R4 // end-1
1116+
// bytes·IndexByte loads the byte array pointer, its length and the byte to
// search for from the argument frame, places the address of the result slot
// in R14, and branches to runtime·indexbytebody<>, which stores the result.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
1117+
MOVD s+0(FP), R3 // R3 = byte array pointer
1118+
MOVD s_len+8(FP), R4 // R4 = length
1119+
MOVBZ c+24(FP), R5 // R5 = byte
1120+
MOVD $ret+32(FP), R14 // R14 = &ret
1121+
BR runtime·indexbytebody<>(SB) // Tail branch: the body writes the result through R14 and executes RET itself.
1122+
1123+
// strings·IndexByte loads the string pointer, its length and the byte to
// search for from the argument frame, places the address of the result slot
// in R14, and branches to runtime·indexbytebody<>, which stores the result.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
1124+
MOVD s+0(FP), R3 // R3 = string
1125+
MOVD s_len+8(FP), R4 // R4 = length
1126+
MOVBZ c+16(FP), R5 // R5 = byte
1127+
MOVD $ret+24(FP), R14 // R14 = &ret
1128+
BR runtime·indexbytebody<>(SB) // Tail branch: the body writes the result through R14 and executes RET itself.
1129+
1130+
// indexbytebody is the shared search routine reached by BR from
// bytes·IndexByte and strings·IndexByte.
// Inputs, as set up by those callers:
//   R3  = pointer to the first byte
//   R4  = length in bytes
//   R5  = byte to find
//   R14 = address at which the result is stored
// Result: the index of the first matching byte, or -1 if there is none,
// written to (R14).
// The data is examined one aligned doubleword at a time using CMPB; bytes
// before the start are masked off and a match past the end is rejected in
// the "done" path.
TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
1131+
DCBT (R3) // Prepare cache line.
1132+
MOVD R3,R10 // Save base address for calculating the index later.
1133+
RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
1134+
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
1135+
1136+
// Calculate last acceptable address and check for possible overflow
1137+
// using a saturated add.
1138+
// Overflows set last acceptable address to 0xffffffffffffffff.
1139+
ADD R4,R3,R7
1140+
SUBC R3,R7,R6
1141+
SUBE R0,R0,R9
1142+
MOVW R9,R6
1143+
OR R6,R7,R7
1144+
1145+
RLDIMI $16,R5,$32,R5 // Continue replicating the byte (now fills the low 32 bits).
1146+
CMPU R4,$32 // Check if it's a small string (<=32 bytes). Those will be processed differently.
1147+
MOVD $-1,R9 // All ones; shifted below to build the mask for the first doubleword.
1148+
WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
1149+
RLDIMI $32,R5,$0,R5 // Byte is now replicated across all 64 bits.
1150+
ADD $-1,R7,R7 // R7 = last acceptable address (end - 1).
1151+
#ifdef GOARCH_ppc64le
1152+
SLD R6,R9,R9 // Prepare mask for Little Endian
1153+
#else
1154+
SRD R6,R9,R9 // Same for Big Endian
1155+
#endif
1156+
BLE small_string // Jump to the small string case if it's <=32 bytes (uses the CMPU R4,$32 result above).
1157+
1158+
// Case for length >32 bytes
1159+
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
1160+
CMPB R12,R5,R3 // Check for a match.
1161+
AND R9,R3,R3 // Mask bytes below s_base
1162+
RLDICL $0,R7,$61,R4 // R4 = low 3 bits of R7: offset of the last valid byte within its doubleword.
1163+
RLDICR $0,R7,$60,R7 // Last doubleword in R7
1164+
CMPU R3,$0,CR7 // If we have a match, jump to the final computation
1165+
BNE CR7,done
1166+
1167+
// Check for doubleword alignment and jump to the loop setup if aligned.
1168+
MOVFL R8,CR7 // Copy low bits of the aligned address into CR7 for the test below.
1169+
BC 12,28,loop_setup // Branch if CR bit 28 (the first bit of CR7) is set.
1170+
1171+
// Not aligned, so handle the second doubleword
1172+
MOVDU 8(R8),R12
1173+
CMPB R12,R5,R3
1174+
CMPU R3,$0,CR7
1175+
BNE CR7,done
1176+
1177+
loop_setup:
1178+
// We are now aligned to a 16-byte boundary. We will load two doublewords
1179+
// per loop iteration. The last doubleword is in R7, so our loop counter
1180+
// starts at (R7-R8)/16.
1181+
SUB R8,R7,R6
1182+
SRD $4,R6,R6
1183+
MOVD R6,CTR
11231184

1185+
// Note: when we have an align directive, align this loop to 32 bytes so
1186+
// it fits in a single icache sector.
11241187
loop:
1125-
CMP R3, R4
1188+
// Load two doublewords, then compare and merge in a single register. We
1189+
// will check two doublewords per iteration, then find out which of them
1190+
// contains the byte later. This speeds up the search.
1191+
MOVD 8(R8),R12
1192+
MOVDU 16(R8),R11
1193+
CMPB R12,R5,R3
1194+
CMPB R11,R5,R9
1195+
OR R3,R9,R6
1196+
CMPU R6,$0,CR7
1197+
BNE CR7,found
1198+
BC 16,0,loop // Decrement CTR and loop while it is nonzero.
1199+
1200+
// Counter zeroed, but we may have another doubleword to read
1201+
CMPU R8,R7 // Already at the last doubleword? Then nothing is left to check.
11261202
BEQ notfound
1127-
MOVBZU 1(R3), R7
1128-
CMP R7, R5
1129-
BNE loop
11301203

1131-
SUB R6, R3 // remove base
1132-
MOVD R3, ret+32(FP)
1133-
RET
1204+
MOVDU 8(R8),R12
1205+
CMPB R12,R5,R3
1206+
CMPU R3,$0,CR6
1207+
BNE CR6,done
11341208

11351209
notfound:
1136-
MOVD $-1, R3
1137-
MOVD R3, ret+32(FP)
1210+
MOVD $-1,R3
1211+
MOVD R3,(R14) // Store -1 in the caller's result slot.
11381212
RET
11391213

1140-
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
1141-
MOVD p+0(FP), R3
1142-
MOVD b_len+8(FP), R4
1143-
MOVBZ c+16(FP), R5 // byte to find
1144-
MOVD R3, R6 // store base for later
1145-
SUB $1, R3
1146-
ADD R3, R4 // end-1
1214+
found:
1215+
// One of the doublewords from the loop contains the byte we are looking
1216+
// for. Check the first doubleword and adjust the address if found.
1217+
CMPU R3,$0,CR6
1218+
ADD $-8,R8,R8
1219+
BNE CR6,done
1220+
1221+
// Not found, so it must be in the second doubleword of the merged pair.
1222+
MOVD R9,R3
1223+
ADD $8,R8,R8
1224+
1225+
done:
1226+
// At this point, R3 has 0xFF in the same position as the byte we are
1227+
// looking for in the doubleword. Use that to calculate the exact index
1228+
// of the byte.
1229+
#ifdef GOARCH_ppc64le
1230+
ADD $-1,R3,R11
1231+
ANDN R3,R11,R11
1232+
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
1233+
#else
1234+
CNTLZD R3,R11 // Count leading zeros (Big Endian).
1235+
#endif
1236+
CMPU R8,R7 // Check if we are at the last doubleword.
1237+
SRD $3,R11 // Convert trailing zeros to bytes.
1238+
ADD R11,R8,R3
1239+
CMPU R11,R4,CR7 // If at the last doubleword, check the byte offset.
1240+
BNE return // Not the last doubleword (CMPU R8,R7 above): the match is in range.
1241+
BLE CR7,return // Last doubleword: accept only if the offset is <= R4.
1242+
MOVD $-1,R3 // The match lies past the end of the data; report not found.
1243+
MOVD R3,(R14)
1244+
RET
11471245

1148-
loop:
1149-
CMP R3, R4
1246+
return:
1247+
SUB R10,R3 // Calculate index.
1248+
MOVD R3,(R14)
1249+
RET
1250+
1251+
small_string:
1252+
// We unroll this loop for better performance.
1253+
CMPU R4,$0 // Check for length=0
11501254
BEQ notfound
1151-
MOVBZU 1(R3), R7
1152-
CMP R7, R5
1153-
BNE loop
11541255

1155-
SUB R6, R3 // remove base
1156-
MOVD R3, ret+24(FP)
1157-
RET
1256+
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
1257+
CMPB R12,R5,R3 // Check for a match.
1258+
AND R9,R3,R3 // Mask bytes below s_base.
1259+
CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
1260+
RLDICL $0,R7,$61,R4 // R4 = low 3 bits of R7: offset of the last valid byte within its doubleword.
1261+
RLDICR $0,R7,$60,R7 // Last doubleword in R7.
1262+
CMPU R8,R7 // Is this already the last doubleword?
1263+
BNE CR7,done
1264+
BEQ notfound // Hit length.
1265+
1266+
MOVDU 8(R8),R12
1267+
CMPB R12,R5,R3
1268+
CMPU R3,$0,CR6
1269+
CMPU R8,R7
1270+
BNE CR6,done
1271+
BEQ notfound
11581272

1159-
notfound:
1160-
MOVD $-1, R3
1161-
MOVD R3, ret+24(FP)
1162-
RET
1273+
MOVDU 8(R8),R12
1274+
CMPB R12,R5,R3
1275+
CMPU R3,$0,CR6
1276+
CMPU R8,R7
1277+
BNE CR6,done
1278+
BEQ notfound
1279+
1280+
MOVDU 8(R8),R12
1281+
CMPB R12,R5,R3
1282+
CMPU R3,$0,CR6
1283+
CMPU R8,R7
1284+
BNE CR6,done
1285+
BEQ notfound
1286+
1287+
MOVDU 8(R8),R12
1288+
CMPB R12,R5,R3
1289+
CMPU R3,$0,CR6
1290+
CMPU R8,R7
1291+
BNE CR6,done
1292+
BR notfound // Fifth doubleword checked; <=32 bytes cannot span more than five.
11631293

11641294
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
11651295
MOVD s1_base+0(FP), R5

0 commit comments

Comments
 (0)