-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[LoongArch] Optimize for immediate value materialization using BSTRINS_D instruction #106332
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Created using spr 1.3.5-bogner
@llvm/pr-subscribers-backend-loongarch @llvm/pr-subscribers-mc Author: wanglei (wangleiat) Changes Full diff: https://github.com/llvm/llvm-project/pull/106332.diff 10 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index c2ae4a0734b6a7..b8f1cdfd2cb354 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -1291,14 +1291,32 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc,
Imm = SignExtend64<32>(Imm);
for (LoongArchMatInt::Inst &Inst : LoongArchMatInt::generateInstSeq(Imm)) {
- unsigned Opc = Inst.Opc;
- if (Opc == LoongArch::LU12I_W)
- Out.emitInstruction(MCInstBuilder(Opc).addReg(DestReg).addImm(Inst.Imm),
- getSTI());
- else
+ switch (Inst.Opc) {
+ case LoongArch::LU12I_W:
Out.emitInstruction(
- MCInstBuilder(Opc).addReg(DestReg).addReg(SrcReg).addImm(Inst.Imm),
+ MCInstBuilder(Inst.Opc).addReg(DestReg).addImm(Inst.Imm), getSTI());
+ break;
+ case LoongArch::ADDI_W:
+ case LoongArch::ORI:
+ case LoongArch::LU32I_D:
+ case LoongArch::LU52I_D:
+ Out.emitInstruction(
+ MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm(
+ Inst.Imm),
getSTI());
+ break;
+ case LoongArch::BSTRINS_D:
+ Out.emitInstruction(MCInstBuilder(Inst.Opc)
+ .addReg(DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg)
+ .addImm(Inst.Imm >> 32)
+ .addImm(Inst.Imm & 0xFF),
+ getSTI());
+ break;
+ default:
+ llvm_unreachable("unexpected opcode generated by LoongArchMatInt");
+ }
SrcReg = DestReg;
}
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
index b6ade6b978d2ce..70ed1e6fbdbdac 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
@@ -62,10 +62,26 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
// The instructions in the sequence are handled here.
for (LoongArchMatInt::Inst &Inst : LoongArchMatInt::generateInstSeq(Imm)) {
SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, GRLenVT);
- if (Inst.Opc == LoongArch::LU12I_W)
- Result = CurDAG->getMachineNode(LoongArch::LU12I_W, DL, GRLenVT, SDImm);
- else
+ switch (Inst.Opc) {
+ case LoongArch::LU12I_W:
+ Result = CurDAG->getMachineNode(Inst.Opc, DL, GRLenVT, SDImm);
+ break;
+ case LoongArch::ADDI_W:
+ case LoongArch::ORI:
+ case LoongArch::LU32I_D:
+ case LoongArch::LU52I_D:
Result = CurDAG->getMachineNode(Inst.Opc, DL, GRLenVT, SrcReg, SDImm);
+ break;
+ case LoongArch::BSTRINS_D:
+ Result = CurDAG->getMachineNode(
+ Inst.Opc, DL, GRLenVT,
+ {SrcReg, SrcReg,
+ CurDAG->getTargetConstant(Inst.Imm >> 32, DL, GRLenVT),
+ CurDAG->getTargetConstant(Inst.Imm & 0xFF, DL, GRLenVT)});
+ break;
+ default:
+ llvm_unreachable("unexpected opcode generated by LoongArchMatInt");
+ }
SrcReg = SDValue(Result, 0);
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 9059da460f1358..d1af65192ee612 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -210,6 +210,14 @@ void LoongArchInstrInfo::movImm(MachineBasicBlock &MBB,
.addImm(Inst.Imm)
.setMIFlag(Flag);
break;
+ case LoongArch::BSTRINS_D:
+ BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg)
+ .addReg(SrcReg, RegState::Kill)
+ .addReg(SrcReg, RegState::Kill)
+ .addImm(Inst.Imm >> 32)
+ .addImm(Inst.Imm & 0xFF)
+ .setMIFlag(Flag);
+ break;
default:
assert(false && "Unknown insn emitted by LoongArchMatInt");
}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
index 1509c436c81098..de5f6ea9aba04c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
@@ -26,11 +26,13 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) {
const int64_t Lo12 = Val & 0xFFF;
InstSeq Insts;
+ // LU52I_D used for: Bits[63:52] | Bits[51:0].
if (Highest12 != 0 && SignExtend64<52>(Val) == 0) {
Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12)));
return Insts;
}
+ // lo32
if (Hi20 == 0)
Insts.push_back(Inst(LoongArch::ORI, Lo12));
else if (SignExtend32<1>(Lo12 >> 11) == SignExtend32<20>(Hi20))
@@ -41,11 +43,82 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) {
Insts.push_back(Inst(LoongArch::ORI, Lo12));
}
+ // hi32
+ // Higher20
if (SignExtend32<1>(Hi20 >> 19) != SignExtend32<20>(Higher20))
Insts.push_back(Inst(LoongArch::LU32I_D, SignExtend64<20>(Higher20)));
+ // Highest12
if (SignExtend32<1>(Higher20 >> 19) != SignExtend32<12>(Highest12))
Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12)));
+ size_t N = Insts.size();
+ if (N < 3)
+ return Insts;
+
+ // When the sequence contains more than 2 instructions, we have the
+ // opportunity to shorten it using the BSTRINS_D instruction. The scenarios
+ // are as follows:
+ //
+ // N of Insts = 3
+ // 1. ORI + LU32I_D + LU52I_D => ORI + BSTRINS_D, TmpVal = ORI
+ // 2. ADDI_W + LU32I_D + LU52I_D => ADDI_W + BSTRINS_D, TmpVal = ADDI_W
+ // 3. LU12I_W + ORI + LU32I_D => ORI + BSTRINS_D, TmpVal = ORI
+ // 4. LU12I_W + LU32I_D + LU52I_D => LU12I_W + BSTRINS_D, TmpVal = LU12I_W
+ //
+ // N of Insts = 4
+ // 5. LU12I_W + ORI + LU32I_D + LU52I_D => LU12I_W + ORI + BSTRINS_D
+ // => ORI + LU52I_D + BSTRINS_D
+ // TmpVal = (LU12I_W | ORI) or (ORI | LU52I_D)
+ // The BSTRINS_D instruction will use the `TmpVal` to construct the `Val`.
+ uint64_t TmpVal1 = 0;
+ uint64_t TmpVal2 = 0;
+ switch (Insts[0].Opc) {
+ default:
+ llvm_unreachable("unexpected opcode");
+ break;
+ case LoongArch::LU12I_W:
+ if (Insts[1].Opc == LoongArch::ORI) {
+ TmpVal1 = Insts[1].Imm;
+ if (N == 3)
+ break;
+ TmpVal2 = Insts[3].Imm << 52 | TmpVal1;
+ }
+ TmpVal1 |= Insts[0].Imm << 12;
+ break;
+ case LoongArch::ORI:
+ case LoongArch::ADDI_W:
+ TmpVal1 = Insts[0].Imm;
+ break;
+ }
+
+ for (uint64_t Msb = 32; Msb < 64; ++Msb) {
+ uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1);
+ for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) {
+ uint64_t LowMask = (1ULL << Lsb) - 1;
+ uint64_t Mask = HighMask | LowMask;
+ uint64_t LsbToZero = TmpVal1 & ((1UL << (Msb - Lsb + 1)) - 1);
+ uint64_t MsbToLsb = LsbToZero << Lsb;
+ if ((MsbToLsb | (TmpVal1 & Mask)) == (uint64_t)Val) {
+ if (Insts[1].Opc == LoongArch::ORI && N == 3)
+ Insts[0] = Insts[1];
+ Insts.pop_back_n(2);
+ Insts.push_back(Inst(LoongArch::BSTRINS_D, Msb << 32 | Lsb));
+ return Insts;
+ }
+ if (TmpVal2 != 0) {
+ LsbToZero = TmpVal2 & ((1UL << (Msb - Lsb + 1)) - 1);
+ MsbToLsb = LsbToZero << Lsb;
+ if ((MsbToLsb | (TmpVal2 & Mask)) == (uint64_t)Val) {
+ Insts[0] = Insts[1];
+ Insts[1] = Insts[3];
+ Insts.pop_back_n(2);
+ Insts.push_back(Inst(LoongArch::BSTRINS_D, Msb << 32 | Lsb));
+ return Insts;
+ }
+ }
+ }
+ }
+
return Insts;
}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
index be1b425894de1a..3a3c12c353fb8e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
@@ -16,6 +16,7 @@ namespace llvm {
namespace LoongArchMatInt {
struct Inst {
unsigned Opc;
+ // Imm: Opc's immediate operand; when Opc == BSTRINS_D, Imm = MSB << 32 | LSB.
int64_t Imm;
Inst(unsigned Opc, int64_t Imm) : Opc(Opc), Imm(Imm) {}
};
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f3236..3efdd08bbea4c4 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -338,14 +338,12 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
; LA64-NEXT: srli.d $a1, $a0, 1
; LA64-NEXT: lu12i.w $a2, 349525
; LA64-NEXT: ori $a2, $a2, 1365
-; LA64-NEXT: lu32i.d $a2, 349525
-; LA64-NEXT: lu52i.d $a2, $a2, 1365
+; LA64-NEXT: bstrins.d $a2, $a2, 62, 32
; LA64-NEXT: and $a1, $a1, $a2
; LA64-NEXT: sub.d $a0, $a0, $a1
; LA64-NEXT: lu12i.w $a1, 209715
; LA64-NEXT: ori $a1, $a1, 819
-; LA64-NEXT: lu32i.d $a1, 209715
-; LA64-NEXT: lu52i.d $a1, $a1, 819
+; LA64-NEXT: bstrins.d $a1, $a1, 61, 32
; LA64-NEXT: and $a2, $a0, $a1
; LA64-NEXT: srli.d $a0, $a0, 2
; LA64-NEXT: and $a0, $a0, $a1
@@ -354,13 +352,11 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
; LA64-NEXT: add.d $a0, $a0, $a1
; LA64-NEXT: lu12i.w $a1, 61680
; LA64-NEXT: ori $a1, $a1, 3855
-; LA64-NEXT: lu32i.d $a1, -61681
-; LA64-NEXT: lu52i.d $a1, $a1, 240
+; LA64-NEXT: bstrins.d $a1, $a1, 59, 32
; LA64-NEXT: and $a0, $a0, $a1
; LA64-NEXT: lu12i.w $a1, 4112
; LA64-NEXT: ori $a1, $a1, 257
-; LA64-NEXT: lu32i.d $a1, 65793
-; LA64-NEXT: lu52i.d $a1, $a1, 16
+; LA64-NEXT: bstrins.d $a1, $a1, 56, 32
; LA64-NEXT: mul.d $a0, $a0, $a1
; LA64-NEXT: srli.d $a0, $a0, 56
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/imm.ll b/llvm/test/CodeGen/LoongArch/imm.ll
index 746306bacc8d57..aca508e99fb960 100644
--- a/llvm/test/CodeGen/LoongArch/imm.ll
+++ b/llvm/test/CodeGen/LoongArch/imm.ll
@@ -47,8 +47,7 @@ define i64 @imm0008000000000fff() {
; CHECK-LABEL: imm0008000000000fff:
; CHECK: # %bb.0:
; CHECK-NEXT: ori $a0, $zero, 4095
-; CHECK-NEXT: lu32i.d $a0, -524288
-; CHECK-NEXT: lu52i.d $a0, $a0, 0
+; CHECK-NEXT: bstrins.d $a0, $a0, 51, 51
; CHECK-NEXT: ret
ret i64 2251799813689343
}
@@ -168,9 +167,8 @@ define i64 @imm0008000080000800() {
define i64 @imm14000000a() {
; CHECK-LABEL: imm14000000a:
; CHECK: # %bb.0:
-; CHECK-NEXT: lu12i.w $a0, 262144
-; CHECK-NEXT: ori $a0, $a0, 10
-; CHECK-NEXT: lu32i.d $a0, 1
+; CHECK-NEXT: ori $a0, $zero, 10
+; CHECK-NEXT: bstrins.d $a0, $a0, 32, 29
; CHECK-NEXT: ret
ret i64 5368709130
}
@@ -179,8 +177,7 @@ define i64 @imm0fff000000000fff() {
; CHECK-LABEL: imm0fff000000000fff:
; CHECK: # %bb.0:
; CHECK-NEXT: ori $a0, $zero, 4095
-; CHECK-NEXT: lu32i.d $a0, -65536
-; CHECK-NEXT: lu52i.d $a0, $a0, 255
+; CHECK-NEXT: bstrins.d $a0, $a0, 59, 48
; CHECK-NEXT: ret
ret i64 1152640029630140415
}
@@ -189,8 +186,7 @@ define i64 @immffecffffffffffec() {
; CHECK-LABEL: immffecffffffffffec:
; CHECK: # %bb.0:
; CHECK-NEXT: addi.w $a0, $zero, -20
-; CHECK-NEXT: lu32i.d $a0, -196609
-; CHECK-NEXT: lu52i.d $a0, $a0, -2
+; CHECK-NEXT: bstrins.d $a0, $a0, 52, 48
; CHECK-NEXT: ret
ret i64 -5348024557502484
}
@@ -199,8 +195,7 @@ define i64 @imm1c000000700000() {
; CHECK-LABEL: imm1c000000700000:
; CHECK: # %bb.0:
; CHECK-NEXT: lu12i.w $a0, 1792
-; CHECK-NEXT: lu32i.d $a0, -262144
-; CHECK-NEXT: lu52i.d $a0, $a0, 1
+; CHECK-NEXT: bstrins.d $a0, $a0, 52, 30
; CHECK-NEXT: ret
ret i64 7881299355238400
}
@@ -210,8 +205,7 @@ define i64 @immf0f0f0f0f0f0f0f0() {
; CHECK: # %bb.0:
; CHECK-NEXT: lu12i.w $a0, -61681
; CHECK-NEXT: ori $a0, $a0, 240
-; CHECK-NEXT: lu32i.d $a0, 61680
-; CHECK-NEXT: lu52i.d $a0, $a0, -241
+; CHECK-NEXT: bstrins.d $a0, $a0, 59, 32
; CHECK-NEXT: ret
ret i64 -1085102592571150096
}
@@ -219,10 +213,9 @@ define i64 @immf0f0f0f0f0f0f0f0() {
define i64 @imm110000014000000a() {
; CHECK-LABEL: imm110000014000000a:
; CHECK: # %bb.0:
-; CHECK-NEXT: lu12i.w $a0, 262144
-; CHECK-NEXT: ori $a0, $a0, 10
-; CHECK-NEXT: lu32i.d $a0, 1
+; CHECK-NEXT: ori $a0, $zero, 10
; CHECK-NEXT: lu52i.d $a0, $a0, 272
+; CHECK-NEXT: bstrins.d $a0, $a0, 32, 29
; CHECK-NEXT: ret
ret i64 1224979104013484042
}
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
index 772ae8d81a88bf..9654542f877459 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
@@ -973,9 +973,8 @@ define i64 @ld_sd_constant(i64 %a) nounwind {
; LA64NOPIC-LABEL: ld_sd_constant:
; LA64NOPIC: # %bb.0:
; LA64NOPIC-NEXT: lu12i.w $a1, -136485
-; LA64NOPIC-NEXT: ori $a1, $a1, 3823
-; LA64NOPIC-NEXT: lu32i.d $a1, -147729
-; LA64NOPIC-NEXT: lu52i.d $a2, $a1, -534
+; LA64NOPIC-NEXT: ori $a2, $a1, 3823
+; LA64NOPIC-NEXT: bstrins.d $a2, $a2, 61, 32
; LA64NOPIC-NEXT: ld.d $a1, $a2, 0
; LA64NOPIC-NEXT: st.d $a0, $a2, 0
; LA64NOPIC-NEXT: move $a0, $a1
@@ -984,9 +983,8 @@ define i64 @ld_sd_constant(i64 %a) nounwind {
; LA64PIC-LABEL: ld_sd_constant:
; LA64PIC: # %bb.0:
; LA64PIC-NEXT: lu12i.w $a1, -136485
-; LA64PIC-NEXT: ori $a1, $a1, 3823
-; LA64PIC-NEXT: lu32i.d $a1, -147729
-; LA64PIC-NEXT: lu52i.d $a2, $a1, -534
+; LA64PIC-NEXT: ori $a2, $a1, 3823
+; LA64PIC-NEXT: bstrins.d $a2, $a2, 61, 32
; LA64PIC-NEXT: ld.d $a1, $a2, 0
; LA64PIC-NEXT: st.d $a0, $a2, 0
; LA64PIC-NEXT: move $a0, $a1
diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
index 1e7a79beb62c61..323858c7613a67 100644
--- a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
+++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
@@ -1128,8 +1128,7 @@ define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
; LA64-NEXT: lu12i.w $a1, 279556
; LA64-NEXT: ori $a1, $a1, 1088
-; LA64-NEXT: lu32i.d $a1, 17472
-; LA64-NEXT: lu52i.d $a1, $a1, 1092
+; LA64-NEXT: bstrins.d $a1, $a1, 62, 32
; LA64-NEXT: add.d $a0, $a0, $a1
; LA64-NEXT: ret
;
@@ -1142,8 +1141,7 @@ define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
; LA64-LARGE-NEXT: lu12i.w $a1, 279556
; LA64-LARGE-NEXT: ori $a1, $a1, 1088
-; LA64-LARGE-NEXT: lu32i.d $a1, 17472
-; LA64-LARGE-NEXT: lu52i.d $a1, $a1, 1092
+; LA64-LARGE-NEXT: bstrins.d $a1, $a1, 62, 32
; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
; LA64-LARGE-NEXT: ret
entry:
diff --git a/llvm/test/MC/LoongArch/Macros/macros-li.s b/llvm/test/MC/LoongArch/Macros/macros-li.s
index 994aa439effa1b..8ac82a766f6043 100644
--- a/llvm/test/MC/LoongArch/Macros/macros-li.s
+++ b/llvm/test/MC/LoongArch/Macros/macros-li.s
@@ -45,8 +45,7 @@ li.d $a0, 0x7ffff00000800
li.d $a0, 0x8000000000fff
# CHECK: ori $a0, $zero, 4095
-# CHECK-NEXT: lu32i.d $a0, -524288
-# CHECK-NEXT: lu52i.d $a0, $a0, 0
+# CHECK-NEXT: bstrins.d $a0, $a0, 51, 51
li.d $a0, 0x8000080000800
# CHECK: lu12i.w $a0, -524288
|
Created using spr 1.3.5-bogner
Created using spr 1.3.5-bogner
2478ae4
into
users/wangleiat/spr/main.loongarch-optimize-for-immediate-value-materialization-using-bstrins_d-instruction
…S_D instruction Reviewed By: heiher, SixWeining Pull Request: #106332
We have some tests failing on our 32-bit single-stage Arm builder. I'm looking into it locally. Usually this is some assumption about type sizes such as size_t. |
Fixed with c55e24b. Some UL that should have been ULL, very easy mistake to make. |
Follow up fix to #106332 `LoongArchMatInt.cpp:96:33: runtime error: shift exponent 64 is too large for 64-bit type` https://lab.llvm.org/buildbot/#/builders/169/builds/2681
Fixed another UB from #106332. Detected here https://lab.llvm.org/buildbot/#/builders/169/builds/2662
No description provided.