Skip to content

Commit 3613104

Browse files
pmurpull[bot]
authored and committed
cmd/internal/asm/ppc64: avoid generating exser nops
"OR $0, R31, R31" is the execution serializing nop called "exser" on ISA 3.1 processors such as Power10. In general, the "OR $0, Rx, Rx" form (where Rx != 0) should be avoided unless it is used explicitly for its microarchitectural side effects. Change-Id: Id76e3a703c902676ba4a3ffb64dd90dad9a320bf Reviewed-on: https://go-review.googlesource.com/c/go/+/537855 TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Lynn Boger <[email protected]> Reviewed-by: Cherry Mui <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Run-TryBot: Paul Murphy <[email protected]> Reviewed-by: Heschi Kreinick <[email protected]>
1 parent e880233 commit 3613104

File tree

2 files changed

+37
-14
lines changed

2 files changed

+37
-14
lines changed

src/cmd/asm/internal/asm/testdata/ppc64.s

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,9 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
179179
ADD $-32768, R6 // 38c68000
180180
ADD $-32768, R6, R5 // 38a68000
181181
// Hex constant 0xFFFFFFFE00000000
182-
ADD $-8589934592, R5 // 3fe0fffe63ff00007bff83e463ff00007cbf2a14 or 0602000038a50000
182+
ADD $-8589934592, R5 // 3fe0fffe600000007bff83e4600000007cbf2a14 or 0602000038a50000
183+
// Hex constant 0xFFFFFFFE00010001
184+
ADD $-8589869055, R5 // 3fe0fffe63ff00017bff83e463ff00017cbf2a14 or 0602000138a50001
183185

184186
//TODO: this compiles to add r5,r6,r0. It should be addi r5,r6,0.
185187
// this is OK since r0 == $0, but the latter is preferred.
@@ -223,6 +225,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
223225
OR $-32768, R6, R7 // 3be080007fe73378
224226
OR $1234567, R5 // 641f001263ffd6877fe52b78
225227
OR $1234567, R5, R3 // 641f001263ffd6877fe32b78
228+
OR $2147483648, R5, R3 // 641f8000600000007fe32b78
229+
OR $2147483649, R5, R3 // 641f800063ff00017fe32b78
226230
ORIS $255, R3, R4
227231

228232
XOR $1, R3 // 68630001
@@ -249,7 +253,6 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
249253
CMPB R3,R4,R4 // 7c6423f8
250254
CMPEQB R3,R4,CR6 // 7f0321c0
251255

252-
// TODO: constants for ADDC?
253256
ADD R3, R4 // 7c841a14
254257
ADD R3, R4, R5 // 7ca41a14
255258
ADDC R3, R4 // 7c841814
@@ -262,6 +265,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
262265
ADDV R3, R4 // 7c841e14
263266
ADDVCC R3, R4 // 7c841e15
264267
ADDCCC R3, R4, R5 // 7ca41815
268+
ADDCCC $65536, R4, R5 // 641f0001600000007cbf2015
269+
ADDCCC $65537, R4, R5 // 641f000163ff00017cbf2015
265270
ADDME R3, R4 // 7c8301d4
266271
ADDMECC R3, R4 // 7c8301d5
267272
ADDMEV R3, R4 // 7c8305d4
@@ -315,6 +320,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
315320
SUBECC R3, R4, R5 // 7ca32111
316321
SUBEV R3, R4, R5 // 7ca32510
317322
SUBEVCC R3, R4, R5 // 7ca32511
323+
SUBC R3, $65536, R4 // 3fe00001600000007c83f810
324+
SUBC R3, $65537, R4 // 3fe0000163ff00017c83f810
318325

319326
MULLW R3, R4 // 7c8419d6
320327
MULLW R3, R4, R5 // 7ca419d6

src/cmd/internal/obj/ppc64/asm9.go

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ const (
6565
PFX_R_PCREL = 1 // Offset is relative to PC, RA should be 0
6666
)
6767

68+
const (
69+
// NOP is the encoding of the preferred hardware nop instruction ("ori r0,r0,0"). It is used for padding and in place of "ori r,r,0" with r != 0.
70+
NOP = 0x60000000
71+
)
72+
6873
type Optab struct {
6974
as obj.As // Opcode
7075
a1 uint8 // p.From argument (obj.Addr). p is of type obj.Prog.
@@ -831,7 +836,6 @@ func span9(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
831836
// lay out the code, emitting code and data relocations.
832837

833838
bp := c.cursym.P
834-
nop := LOP_IRR(OP_ORI, REGZERO, REGZERO, 0)
835839
var i int32
836840
for p := c.cursym.Func().Text.Link; p != nil; p = p.Link {
837841
c.pc = p.Pc
@@ -846,13 +850,13 @@ func span9(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
846850
if v > 0 {
847851
// Same padding instruction for all
848852
for i = 0; i < int32(v/4); i++ {
849-
c.ctxt.Arch.ByteOrder.PutUint32(bp, nop)
853+
c.ctxt.Arch.ByteOrder.PutUint32(bp, NOP)
850854
bp = bp[4:]
851855
}
852856
}
853857
} else {
854858
if p.Mark&PFX_X64B != 0 {
855-
c.ctxt.Arch.ByteOrder.PutUint32(bp, nop)
859+
c.ctxt.Arch.ByteOrder.PutUint32(bp, NOP)
856860
bp = bp[4:]
857861
}
858862
o.asmout(&c, p, o, &out)
@@ -2531,6 +2535,18 @@ func decodeMask64(mask int64) (mb, me uint32, valid bool) {
25312535
return mb, (me - 1) & 63, valid
25322536
}
25332537

2538+
// loadl16 returns the instruction which ORs the lower 16 bits of the constant d into register r, or NOP if those bits are all zero.
2539+
func loadl16(r int, d int64) uint32 {
2540+
v := uint16(d) // Only the low 16 bits of d are used; the upper bits are discarded.
2541+
if v == 0 {
2542+
// Avoid generating "ori r,r,0", r != 0. Instead, generate the architecturally preferred nop.
2543+
// For example, "ori r31,r31,0" is a special execution serializing nop on Power10 called "exser".
2544+
return NOP
2545+
}
2546+
return LOP_IRR(OP_ORI, uint32(r), uint32(r), uint32(v))
2547+
}
2548+
2549+
// Load the upper 16 bits of a 32b constant into register r.
25342550
func loadu32(r int, d int64) uint32 {
25352551
v := int32(d >> 16)
25362552
if isuint32(uint64(d)) {
@@ -2734,7 +2750,7 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
27342750
rel.Add = int64(v)
27352751
rel.Type = objabi.R_CALLPOWER
27362752
}
2737-
o2 = 0x60000000 // nop, sometimes overwritten by ld r2, 24(r1) when dynamic linking
2753+
o2 = NOP // nop, sometimes overwritten by ld r2, 24(r1) when dynamic linking
27382754

27392755
case 13: /* mov[bhwd]{z,} r,r */
27402756
// This needs to handle "MOV* $0, Rx". This shows up because $0 also
@@ -2957,14 +2973,14 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
29572973
} else if o.size == 12 {
29582974
// Note, o1 is ADDIS if d is negative, ORIS otherwise.
29592975
o1 = loadu32(REGTMP, d) // tmp = d & 0xFFFF0000
2960-
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d))) // tmp |= d & 0xFFFF
2976+
o2 = loadl16(REGTMP, d) // tmp |= d & 0xFFFF
29612977
o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r)) // to = from + tmp
29622978
} else {
29632979
// For backwards compatibility with GOPPC64 < 10, generate 34b constants in register.
2964-
o1 = LOP_IRR(OP_ADDIS, REGZERO, REGTMP, uint32(d>>32)) // tmp = sign_extend((d>>32)&0xFFFF0000)
2965-
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(d>>16)) // tmp |= (d>>16)&0xFFFF
2966-
o3 = AOP_MD(OP_RLDICR, REGTMP, REGTMP, 16, 63-16) // tmp <<= 16
2967-
o4 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(uint16(d))) // tmp |= d&0xFFFF
2980+
o1 = LOP_IRR(OP_ADDIS, REGZERO, REGTMP, uint32(d>>32)) // tmp = sign_extend((d>>32)&0xFFFF0000)
2981+
o2 = loadl16(REGTMP, int64(d>>16)) // tmp |= (d>>16)&0xFFFF
2982+
o3 = AOP_MD(OP_RLDICR, REGTMP, REGTMP, 16, 63-16) // tmp <<= 16
2983+
o4 = loadl16(REGTMP, int64(uint16(d))) // tmp |= d&0xFFFF
29682984
o5 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
29692985
}
29702986

@@ -2985,7 +3001,7 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
29853001
o2 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
29863002
} else {
29873003
o1 = loadu32(REGTMP, d)
2988-
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
3004+
o2 = loadl16(REGTMP, d)
29893005
o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
29903006
}
29913007
if p.From.Sym != nil {
@@ -3081,9 +3097,9 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
30813097
if p.To.Reg == REGTMP || p.From.Reg == REGTMP {
30823098
c.ctxt.Diag("can't synthesize large constant\n%v", p)
30833099
}
3084-
v := c.regoff(p.GetFrom3())
3100+
v := c.vregoff(p.GetFrom3())
30853101
o1 = AOP_IRR(OP_ADDIS, REGTMP, REGZERO, uint32(v)>>16)
3086-
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(v))
3102+
o2 = loadl16(REGTMP, v)
30873103
o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), uint32(p.From.Reg), REGTMP)
30883104
if p.From.Sym != nil {
30893105
c.ctxt.Diag("%v is not supported", p)

0 commit comments

Comments
 (0)