[AMDGPU] Do not allow M0 as v_readfirstlane_b32 dst #128851
M0 can only be written to by the SALU, so `v_readfirstlane_b32 m0` is effectively useless. Represent this by restricting the destination register class of that instruction to `SReg_32_XM0`, which excludes M0.

There are a lot of test changes due to the register class changing, but most of them are trivial. In some cases, an extra register and an `s_mov_b32` are needed.

Fixes SWDEV-513269
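For illustration, a hand-written sketch (not taken from the patch's tests; `s4` is an arbitrary SGPR chosen here): where a value could previously be read straight into m0, an intermediate SGPR plus an `s_mov_b32` is needed instead.

```asm
; before: VALU write directly to m0 (effectively useless per the description above)
v_readfirstlane_b32 m0, v0

; after: read into an ordinary SGPR, then move it to m0 with the SALU
v_readfirstlane_b32 s4, v0
s_mov_b32 m0, s4
```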
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-llvm-analysis

Author: Pierre van Houtryve (Pierre-vh)

Changes

M0 can only be written to by the SALU, so `v_readfirstlane_b32 m0` is effectively useless. Represent this by restricting the destination register class of that instruction to `SReg_32_XM0`, which excludes M0.

There are a lot of test changes due to the register class changing, but most of them are trivial. In some cases, an extra register and an `s_mov_b32` are needed.

Fixes SWDEV-513269

Patch is 1.86 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128851.diff

149 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2693ad3894cca..96c918a9a7f76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
for (unsigned i = 0; i < NumParts; ++i) {
Register SrcPart = SrcParts[i];
- Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
const TargetRegisterClass *Constrained =
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 308ab8e3b82c4..2ed313eac649e 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -271,6 +271,7 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)
DECODE_OPERAND_REG_7(SReg_32, OPW32)
+DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ac69bf6d038ec..a99fd25477553 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1069,6 +1069,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
<< " is being turned to v_readfirstlane_b32"
<< " Score: " << C.second.Score << "\n");
Register DstReg = MI->getOperand(0).getReg();
+ MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+
Register SrcReg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
const TargetRegisterClass *SrcRC =
@@ -1092,7 +1094,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
Result, *MRI, MI->getOperand(1), SrcRC,
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
Register PartialDst =
- MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, *Result, Result->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
.addReg(PartialSrc);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index ce21f8963fe88..97736e2410c18 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -305,6 +305,7 @@ class PrologEpilogSGPRSpillBuilder {
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
TmpVGPR, FI, FrameReg, DwordOff);
+ MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
.addReg(TmpVGPR, RegState::Kill);
DwordOff += 4;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index be7cdde802b51..9a0eee6b44891 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4569,7 +4569,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
Register PhiExec = MRI.createVirtualRegister(BoolRC);
Register NewExec = MRI.createVirtualRegister(BoolRC);
- Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurrentIdxReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register CondReg = MRI.createVirtualRegister(BoolRC);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
@@ -5255,18 +5256,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32;
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
- Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
.addReg(Src0.getReg());
Src0.setReg(RegOp0);
}
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
- Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
.addReg(Src1.getReg());
Src1.setReg(RegOp1);
}
- Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
.addReg(Src2.getReg());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d5d54337306c0..d1670040591d9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6515,7 +6515,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
Register VScalarOp = ScalarOp->getReg();
if (NumSubRegs == 1) {
- Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
.addReg(VScalarOp);
@@ -6547,8 +6547,10 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
"Unhandled register size");
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
- Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurRegLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register CurRegHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// Read the next variant <- also loop target.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
@@ -7657,9 +7659,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
if (Inst.isCopy() && DstReg.isPhysical() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
// TODO: Only works for 32 bit registers.
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
- .add(Inst.getOperand(1));
+ if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .add(Inst.getOperand(1));
+ } else {
+ Register NewDst =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
+ .add(Inst.getOperand(1));
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
+ DstReg)
+ .addReg(NewDst);
+ }
Inst.eraseFromParent();
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5a078873679cb..aef25c73641e4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2994,10 +2994,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
- Register NewDest =
- IsCopy ? ResultReg
- : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
- Shift, false, 0);
+ Register NewDest;
+ if (IsCopy) {
+ MF->getRegInfo().constrainRegClass(ResultReg,
+ &AMDGPU::SReg_32_XM0RegClass);
+ NewDest = ResultReg;
+ } else {
+ NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+ Shift, false, 0);
+ }
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
.addReg(TmpResultReg);
ResultReg = NewDest;
@@ -3120,10 +3125,17 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(TmpResultReg);
}
- Register NewDest = IsCopy ? ResultReg
- : RS->scavengeRegisterBackwards(
- AMDGPU::SReg_32RegClass, *Add,
- false, 0, /*AllowSpill=*/true);
+ Register NewDest;
+ if (IsCopy) {
+ MF->getRegInfo().constrainRegClass(ResultReg,
+ &AMDGPU::SReg_32_XM0RegClass);
+ NewDest = ResultReg;
+ } else {
+ NewDest = RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
+ /*AllowSpill=*/true);
+ }
+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
.addReg(TmpResultReg);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index a407ae797a48b..def06c1e9a0d7 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -243,7 +243,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
} // End isMoveImm = 1
def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
- let DstRC = RegisterOperand<SReg_32>;
+ let DstRC = RegisterOperand<SReg_32_XM0>;
let Src0RC32 = VRegOrLdsSrc_32;
let Asm32 = " $vdst, $src0";
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir
index 9d15b8990bad3..6e1b5d641a8b7 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir
@@ -12,7 +12,7 @@ body: |
; CHECK-NEXT: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
- %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
+ %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
@@ -33,7 +33,7 @@ body: |
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
- %5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ %5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
S_ENDPGM 0
...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
index 9a7e755e5f5c8..f7c874be87d36 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
@@ -14,7 +14,7 @@ body: |
%0:vreg_64 = IMPLICIT_DEF
%1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
%2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
- %3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
+ %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
S_ENDPGM 0
...
@@ -50,7 +50,7 @@ body: |
%1:vreg_64 = IMPLICIT_DEF
%2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
%3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
+ %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
S_ENDPGM 0
...
@@ -104,7 +104,7 @@ body: |
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
- %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
+ %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
index f71f573e5a799..23931ac358843 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
@@ -104,9 +104,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@@ -131,9 +131,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -158,9 +158,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
@@ -187,9 +187,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -298,9 +298,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(...
[truncated]
@llvm/pr-subscribers-backend-amdgpu
@@ -243,7 +243,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
} // End isMoveImm = 1

def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
- let DstRC = RegisterOperand<SReg_32>;
+ let DstRC = RegisterOperand<SReg_32_XM0>;
This also applies to readlane and writelane. I assume this misses those because they have different signatures.
Not writelane
LGTM.
It's annoying that so many bits of C++ code duplicate the name of the regclass, which is already available from the instruction definition.
Should also do v_readlane_b32 as Matt said.
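As a hedged sketch of the reviewer's point (not part of this patch, and assuming `TII`, `TRI`, `MRI`, `MBB`, `MI`, `DL` and a source VGPR `SrcVGPR` are in scope as at the call sites above): the destination class could be queried from the instruction description instead of repeating `SReg_32_XM0RegClass` in C++.

```cpp
// Sketch only: derive V_READFIRSTLANE_B32's destination register class from
// its MCInstrDesc rather than hard-coding SReg_32_XM0RegClass at the call site.
const MCInstrDesc &Desc = TII->get(AMDGPU::V_READFIRSTLANE_B32);
const TargetRegisterClass *DstRC =
    TII->getRegClass(Desc, /*OpNum=*/0, TRI, *MBB.getParent()); // operand 0 is $vdst
Register Dst = MRI.createVirtualRegister(DstRC);
BuildMI(MBB, MI, DL, Desc, Dst).addReg(SrcVGPR);
```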
See llvm#128851 - this is the same patch, but for v_readlane_b32. That instruction is used much less often, so fewer changes were required.