@@ -1113,53 +1113,183 @@ equal:
1113
1113
MOVBZ R3 , ret + 48 (FP)
1114
1114
RET
1115
1115
1116
- TEXT bytes·IndexByte(SB) , NOSPLIT , $ 0 - 40
1117
- MOVD s + 0 (FP) , R3
1118
- MOVD s_len + 8 (FP) , R4
1119
- MOVBZ c + 24 (FP) , R5 // byte to find
1120
- MOVD R3 , R6 // store base for later
1121
- SUB $ 1 , R3
1122
- ADD R3 , R4 // end - 1
1116
+ TEXT bytes·IndexByte(SB) , NOSPLIT|NOFRAME , $ 0 - 40 // Entry stub: loads the arguments into registers and branches to the shared search body.
1117
+ MOVD s + 0 (FP) , R3 // R3 = byte array pointer
1118
+ MOVD s_len + 8 (FP) , R4 // R4 = length
1119
+ MOVBZ c + 24 (FP) , R5 // R5 = byte to find (zero-extended load)
1120
+ MOVD $ ret + 32 (FP) , R14 // R14 = & ret; the shared body stores the result through this pointer
1121
+ BR runtime·indexbytebody<>(SB) // Branch, not call: the shared body stores the result and executes RET itself.
1122
+
1123
+ TEXT strings·IndexByte(SB) , NOSPLIT|NOFRAME , $ 0 - 32 // Entry stub: loads the arguments into registers and branches to the shared search body.
1124
+ MOVD s + 0 (FP) , R3 // R3 = string data pointer
1125
+ MOVD s_len + 8 (FP) , R4 // R4 = length
1126
+ MOVBZ c + 16 (FP) , R5 // R5 = byte to find (zero-extended load)
1127
+ MOVD $ ret + 24 (FP) , R14 // R14 = & ret; the shared body stores the result through this pointer
1128
+ BR runtime·indexbytebody<>(SB) // Branch, not call: the shared body stores the result and executes RET itself.
1129
+
1130
+ TEXT runtime·indexbytebody<>(SB) , NOSPLIT|NOFRAME , $ 0 - 0 // Shared search body. In: R3 = data pointer, R4 = length, R5 = byte to find, R14 = address of the result slot. Stores the index of the matching byte, or -1, through R14.
1131
+ DCBT (R3) // Touch (prefetch) the cache line holding the start of the data.
1132
+ MOVD R3 , R10 // Save base address for calculating the index later.
1133
+ RLDICR $ 0 , R3 , $ 60 , R8 // R8 = R3 with the low 3 bits cleared (address aligned down to a doubleword boundary).
1134
+ RLDIMI $ 8 , R5 , $ 48 , R5 // Start replicating the byte across R5 (continued by the two RLDIMI below).
1135
+
1136
+ // Calculate last acceptable address and check for possible overflow
1137
+ // using a saturated add.
1138
+ // Overflows set last acceptable address to 0xffffffffffffffff.
1139
+ ADD R4 , R3 , R7 // R7 = base + length (may wrap around).
1140
+ SUBC R3 , R7 , R6 // Subtract the base again; only the carry out of this subtract is used.
1141
+ SUBE R0 , R0 , R9 // R9 = 0 if the add did not wrap, -1 if it did (derived from the carry above).
1142
+ MOVW R9 , R6 // R6 = sign-extended R9.
1143
+ OR R6 , R7 , R7 // Saturate: R7 becomes all ones when the add wrapped, otherwise unchanged.
1144
+
1145
+ RLDIMI $ 16 , R5 , $ 32 , R5 // Continue replicating the byte across R5.
1146
+ CMPU R4 , $ 32 // Check if it's a small string (<= 32 bytes, see BLE below). Those will be processed differently.
1147
+ MOVD $ - 1 , R9 // All ones; shifted below to build the mask for the first doubleword.
1148
+ WORD $ 0x54661EB8 // Calculate padding in R6, in bits: (R3 & 7) * 8 (rlwinm r6, r3, 3, 26, 28), hand-encoded.
1149
+ RLDIMI $ 32 , R5 , $ 0 , R5 // The byte is now replicated in all eight bytes of R5.
1150
+ ADD $ - 1 , R7 , R7 // R7 = address of the last byte to examine.
1151
+ #ifdef GOARCH_ppc64le
1152
+ SLD R6 , R9 , R9 // Prepare mask for Little Endian: clears the bits of the bytes that precede the base address.
1153
+ #else
1154
+ SRD R6 , R9 , R9 // Same for Big Endian, where the preceding bytes sit in the high-order bits.
1155
+ #endif
1156
+ BLE small_string // Jump to the small string case if it's <= 32 bytes (condition set by the CMPU above).
1157
+
1158
+ // Case for length > 32 bytes: check the first doubleword (and possibly the second) before entering the two-doubleword loop.
1159
+ MOVD 0 ( R8 ) , R12 // Load one doubleword from the aligned address in R8.
1160
+ CMPB R12 , R5 , R3 // Check for a match: R3 gets 0xFF in every byte position where R12 equals the searched byte.
1161
+ AND R9 , R3 , R3 // Mask out matches in bytes below s_base (they are not part of the input).
1162
+ RLDICL $ 0 , R7 , $ 61 , R4 // R4 = R7 & 7: byte offset of the last valid byte inside the last doubleword (compared against at done).
1163
+ RLDICR $ 0 , R7 , $ 60 , R7 // R7 = address of the last doubleword (low 3 bits cleared).
1164
+ CMPU R3 , $ 0 , CR7 // If we have a match, jump to the final computation.
1165
+ BNE CR7 , done
1166
+
1167
+ // Check the alignment of the next doubleword and jump to the loop setup if
1168
+ + MOVFL R8 , CR7 // Copy the low bits of R8 into CR7 so one address bit can be tested (NOTE(review): relies on mtcrf field-7 semantics; confirm).
1169
+ BC 12 , 28 , loop_setup // Branch if CR bit 28 (the first bit of CR7) is set.
1170
+
1171
+ // Not aligned, so handle the second doubleword on its own before the loop.
1172
+ MOVDU 8 ( R8 ) , R12 // Load with update: R8 advances by 8.
1173
+ CMPB R12 , R5 , R3
1174
+ CMPU R3 , $ 0 , CR7
1175
+ BNE CR7 , done // Match in the second doubleword.
1176
+
1177
+ loop_setup: // Prepare CTR for the main loop, which scans two doublewords per iteration.
1178
+ // We are now aligned to a 16-byte boundary. We will load two doublewords
1179
+ // per loop iteration. The last doubleword is in R7, so our loop counter
1180
+ // starts at (R7 - R8) / 16.
1181
+ SUB R8 , R7 , R6 // R6 = R7 - R8: bytes between the current and the last doubleword.
1182
+ SRD $ 4 , R6 , R6 // Divide by 16 (two doublewords per iteration).
1183
+ MOVD R6 , CTR // CTR = iteration count.
1123
1184
1185
+ // Note: when we have an align directive, align this loop to 32 bytes so
1186
+ // it fits in a single icache sector.
1124
1187
loop :
1125
- CMP R3 , R4
1188
+ // Load two doublewords, then compare and merge in a single register. We
1189
+ // will check two doublewords per iteration, then find out which of them
1190
+ // contains the byte later (at found). This speeds up the search.
1191
+ MOVD 8 ( R8 ) , R12 // First doubleword of the pair.
1192
+ MOVDU 16 ( R8 ) , R11 // Second doubleword of the pair; load with update advances R8 by 16.
1193
+ CMPB R12 , R5 , R3 // R3 = match mask for the first doubleword.
1194
+ CMPB R11 , R5 , R9 // R9 = match mask for the second doubleword.
1195
+ OR R3 , R9 , R6 // Merge both masks so a single compare covers the pair.
1196
+ CMPU R6 , $ 0 , CR7
1197
+ BNE CR7 , found // A match in either doubleword.
1198
+ BC 16 , 0 , loop // Decrement CTR and loop while it is non-zero.
1199
+
1200
+ // Counter zeroed, but we may have another doubleword to read.
1201
+ CMPU R8 , R7 // Is R8 already at the last doubleword?
1126
1202
BEQ notfound
1127
- MOVBZU 1 (R3) , R7
1128
- CMP R7 , R5
1129
- BNE loop
1130
1203
1131
- SUB R6 , R3 // remove base
1132
- MOVD R3 , ret + 32 (FP)
1133
- RET
1204
+ MOVDU 8 ( R8 ) , R12 // Load the remaining doubleword; R8 advances by 8.
1205
+ CMPB R12 , R5 , R3
1206
+ CMPU R3 , $ 0 , CR6
1207
+ BNE CR6 , done // Match in the final doubleword; otherwise fall through to notfound.
1134
1208
1135
1209
notfound:
1136
- MOVD $ - 1 , R3
1137
- MOVD R3 , ret + 32 (FP )
1210
+ MOVD $ - 1 , R3 // -1 is the "byte not found" result.
1211
+ MOVD R3 , ( R14 ) // Store the result through the pointer passed in R14.
1138
1212
RET
1139
1213
1140
- TEXT strings·IndexByte(SB) , NOSPLIT , $ 0 - 32
1141
- MOVD p + 0 (FP) , R3
1142
- MOVD b_len + 8 (FP) , R4
1143
- MOVBZ c + 16 (FP) , R5 // byte to find
1144
- MOVD R3 , R6 // store base for later
1145
- SUB $ 1 , R3
1146
- ADD R3 , R4 // end - 1
1214
+ found: // Reached from the main loop when the merged match mask is non-zero.
1215
+ // One of the doublewords from the loop contains the byte we are looking
1216
+ // for. Check the first doubleword and adjust the address if found.
1217
+ CMPU R3 , $ 0 , CR6 // R3 still holds the match mask of the first doubleword of the pair.
1218
+ ADD $ - 8 , R8 , R8 // Step R8 back by 8 so it addresses the first doubleword of the pair.
1219
+ BNE CR6 , done
1220
+
1221
+ // Not found, so it must be in the second doubleword of the merged pair.
1222
+ MOVD R9 , R3 // Use the second doubleword's match mask.
1223
+ ADD $ 8 , R8 , R8 // Undo the adjustment: R8 addresses the second doubleword again.
1224
+
1225
+ done:
1226
+ // At this point, R3 has 0xFF in the same position as the byte we are
1227
+ // looking for in the doubleword. Use that to calculate the exact index
1228
+ // of the byte.
1229
+ #ifdef GOARCH_ppc64le
1230
+ ADD $ - 1 , R3 , R11 // R11 = R3 - 1
1231
+ ANDN R3 , R11 , R11 // R11 = (R3 - 1) AND NOT R3: ones only in the bits below the lowest set bit of R3.
1232
+ POPCNTD R11 , R11 // Count trailing zeros (Little Endian).
1233
+ #else
1234
+ CNTLZD R3 , R11 // Count leading zeros (Big Endian).
1235
+ #endif
1236
+ CMPU R8 , R7 // Check if we are at the last doubleword.
1237
+ SRD $ 3 , R11 // Convert the bit count (trailing or leading zeros) to a byte offset within the doubleword.
1238
+ ADD R11 , R8 , R3 // R3 = address of the matching byte.
1239
+ CMPU R11 , R4 , CR7 // If at the last doubleword, check the byte offset against R4.
1240
+ BNE return // Not the last doubleword (result of CMPU R8, R7 above): accept the match.
1241
+ BLE CR7 , return // Last doubleword: accept only if the byte offset is <= R4.
1242
+ MOVD $ - 1 , R3 // The match lies past the last valid byte, so report not found.
1243
+ MOVD R3 , ( R14 ) // Store the result through the pointer passed in R14.
1244
+ RET
1147
1245
1148
- loop :
1149
- CMP R3 , R4
1246
+ return: // R3 holds the address of the matching byte.
1247
+ SUB R10 , R3 // Calculate index: match address minus the base address saved in R10.
1248
+ MOVD R3 , ( R14 ) // Store the index through the pointer passed in R14.
1249
+ RET
1250
+
1251
+ small_string: // Reached via BLE after CMPU R4, $32, i.e. for lengths of at most 32 bytes.
1252
+ // The search is fully unrolled: the first doubleword plus up to four more (the MOVDU steps below) are checked.
1253
+ CMPU R4 , $ 0 // Check for length = 0.
1150
1254
BEQ notfound
1151
- MOVBZU 1 (R3) , R7
1152
- CMP R7 , R5
1153
- BNE loop
1154
1255
1155
- SUB R6 , R3 // remove base
1156
- MOVD R3 , ret + 24 (FP)
1157
- RET
1256
+ MOVD 0 ( R8 ) , R12 // Load one doubleword from the aligned address in R8.
1257
+ CMPB R12 , R5 , R3 // Check for a match: R3 gets 0xFF in every byte position where R12 equals the searched byte.
1258
+ AND R9 , R3 , R3 // Mask out matches in bytes below s_base.
1259
+ CMPU R3 , $ 0 , CR7 // CR7 records whether the first doubleword has a match (branch is at BNE CR7 below).
1260
+ RLDICL $ 0 , R7 , $ 61 , R4 // R4 = R7 & 7: byte offset of the last valid byte inside the last doubleword (compared against at done).
1261
+ RLDICR $ 0 , R7 , $ 60 , R7 // R7 = address of the last doubleword (low 3 bits cleared).
1262
+ CMPU R8 , R7 // Is the first doubleword also the last one?
1263
+ BNE CR7 , done // Match in the first doubleword.
1264
+ BEQ notfound // No match and this was the last doubleword: hit length.
1265
+
1266
+ MOVDU 8 ( R8 ) , R12 // Second doubleword; load with update advances R8 by 8.
1267
+ CMPB R12 , R5 , R3
1268
+ CMPU R3 , $ 0 , CR6 // CR6: match in this doubleword?
1269
+ CMPU R8 , R7 // Is this the last doubleword?
1270
+ BNE CR6 , done
1271
+ BEQ notfound
1158
1272
1159
- notfound:
1160
- MOVD $ - 1 , R3
1161
- MOVD R3 , ret + 24 (FP)
1162
- RET
1273
+ MOVDU 8 ( R8 ) , R12 // Third doubleword.
1274
+ CMPB R12 , R5 , R3
1275
+ CMPU R3 , $ 0 , CR6
1276
+ CMPU R8 , R7
1277
+ BNE CR6 , done
1278
+ BEQ notfound
1279
+
1280
+ MOVDU 8 ( R8 ) , R12 // Fourth doubleword.
1281
+ CMPB R12 , R5 , R3
1282
+ CMPU R3 , $ 0 , CR6
1283
+ CMPU R8 , R7
1284
+ BNE CR6 , done
1285
+ BEQ notfound
1286
+
1287
+ MOVDU 8 ( R8 ) , R12 // Fifth doubleword.
1288
+ CMPB R12 , R5 , R3
1289
+ CMPU R3 , $ 0 , CR6
1290
+ CMPU R8 , R7 // Sets the condition tested by BNE return at done.
1291
+ BNE CR6 , done
1292
+ BR notfound // No match in the fifth doubleword; nothing further is examined.
1163
1293
1164
1294
TEXT runtime·cmpstring(SB) , NOSPLIT|NOFRAME , $ 0 - 40
1165
1295
MOVD s1_base + 0 (FP) , R5
0 commit comments