Skip to content

Commit 816e7cd

Browse files
authored
AMDGPU: Factor agpr reg_sequence folding into a function (#129002)
1 parent 65c45bf commit 816e7cd

File tree

1 file changed

+87
-79
lines changed

1 file changed

+87
-79
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 87 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
128128
bool tryFoldCndMask(MachineInstr &MI) const;
129129
bool tryFoldZeroHighBits(MachineInstr &MI) const;
130130
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
131+
132+
bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
131133
bool tryFoldFoldableCopy(MachineInstr &MI,
132134
MachineOperand *&CurrentKnownM0Val) const;
133135

@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
10121014
UseMI->getOperand(0).getReg().isVirtual() &&
10131015
!UseMI->getOperand(1).getSubReg()) {
10141016
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
1015-
unsigned Size = TII->getOpSize(*UseMI, 1);
10161017
Register UseReg = OpToFold.getReg();
10171018
UseMI->getOperand(1).setReg(UseReg);
10181019
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1021,84 +1022,9 @@ void SIFoldOperandsImpl::foldOperand(
10211022
OpToFold.setIsKill(false);
10221023

10231024
// Remove kill flags as kills may now be out of order with uses.
1024-
MRI->clearKillFlags(OpToFold.getReg());
1025-
1026-
// That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
1027-
// can only accept VGPR or inline immediate. Recreate a reg_sequence with
1028-
// its initializers right here, so we will rematerialize immediates and
1029-
// avoid copies via different reg classes.
1030-
SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
1031-
if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1032-
getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1033-
const DebugLoc &DL = UseMI->getDebugLoc();
1034-
MachineBasicBlock &MBB = *UseMI->getParent();
1035-
1036-
UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1037-
for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
1038-
UseMI->removeOperand(I);
1039-
1040-
MachineInstrBuilder B(*MBB.getParent(), UseMI);
1041-
DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1042-
SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
1043-
for (unsigned I = 0; I < Size / 4; ++I) {
1044-
MachineOperand *Def = Defs[I].first;
1045-
TargetInstrInfo::RegSubRegPair CopyToVGPR;
1046-
if (Def->isImm() &&
1047-
TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1048-
int64_t Imm = Def->getImm();
1049-
1050-
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1051-
BuildMI(MBB, UseMI, DL,
1052-
TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
1053-
B.addReg(Tmp);
1054-
} else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
1055-
auto Src = getRegSubRegPair(*Def);
1056-
Def->setIsKill(false);
1057-
if (!SeenAGPRs.insert(Src)) {
1058-
// We cannot build a reg_sequence out of the same registers, they
1059-
// must be copied. Better do it here before copyPhysReg() created
1060-
// several reads to do the AGPR->VGPR->AGPR copy.
1061-
CopyToVGPR = Src;
1062-
} else {
1063-
B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
1064-
Src.SubReg);
1065-
}
1066-
} else {
1067-
assert(Def->isReg());
1068-
Def->setIsKill(false);
1069-
auto Src = getRegSubRegPair(*Def);
1070-
1071-
// Direct copy from SGPR to AGPR is not possible. To avoid creation
1072-
// of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
1073-
// create a copy here and track if we already have such a copy.
1074-
if (TRI->isSGPRReg(*MRI, Src.Reg)) {
1075-
CopyToVGPR = Src;
1076-
} else {
1077-
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1078-
BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
1079-
B.addReg(Tmp);
1080-
}
1081-
}
1082-
1083-
if (CopyToVGPR.Reg) {
1084-
auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
1085-
Register &Vgpr = It->second;
1086-
if (Inserted) {
1087-
Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1088-
BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
1089-
}
1090-
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1091-
BuildMI(MBB, UseMI, DL,
1092-
TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
1093-
B.addReg(Tmp);
1094-
}
1095-
1096-
B.addImm(Defs[I].second);
1097-
}
1098-
LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
1099-
}
1100-
1101-
return;
1025+
MRI->clearKillFlags(UseReg);
1026+
if (foldCopyToAGPRRegSequence(UseMI))
1027+
return;
11021028
}
11031029

11041030
unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,88 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
15581484
return true;
15591485
}
15601486

1487+
/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
///
/// \p CopyMI is expected to be a COPY whose source operand (operand 1) is the
/// result of a REG_SEQUENCE. On success \p CopyMI is rewritten in place into a
/// new REG_SEQUENCE whose inputs are AGPR_32 temporaries, and new
/// V_ACCVGPR_WRITE_B32_e64 / COPY instructions are inserted before it.
///
/// \returns true if the copy was rewritten, false if it does not match
/// (destination is not an AGPR, or the source is not a recognizable
/// reg_sequence initializer chain).
bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
  // initializers right here, so we will rematerialize immediates and avoid
  // copies via different reg classes.
  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
    return false;
  Register UseReg = CopyMI->getOperand(1).getReg();
  // Defs holds one (operand, subreg-index) pair per 32-bit element of the
  // source reg_sequence, as collected by getRegSeqInit.
  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
    return false;

  const DebugLoc &DL = CopyMI->getDebugLoc();
  MachineBasicBlock &MBB = *CopyMI->getParent();

  // Morph the COPY into a REG_SEQUENCE in place: keep operand 0 (the AGPR
  // def) and strip all other operands; they are rebuilt below. Operands are
  // removed back-to-front so earlier indices stay valid.
  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
    CopyMI->removeOperand(I);

  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
  // Cache of source (reg, subreg) -> VGPR copy, so each SGPR/repeated-AGPR
  // source is copied to a VGPR at most once.
  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
  // AGPR sources already used directly in the new reg_sequence; duplicates
  // must be routed through a VGPR copy instead (see comment below).
  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
    MachineOperand *Def = Defs[I].first;
    // If set by one of the cases below, this element's value must first be
    // materialized in a VGPR and then accvgpr-written into a fresh AGPR.
    TargetInstrInfo::RegSubRegPair CopyToVGPR;
    if (Def->isImm() &&
        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
      int64_t Imm = Def->getImm();

      // Rematerialize the inline constant directly into an AGPR.
      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
          .addImm(Imm);
      B.addReg(Tmp);
    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
      auto Src = getRegSubRegPair(*Def);
      // Kill flags may no longer be accurate once the value is reused here.
      Def->setIsKill(false);
      if (!SeenAGPRs.insert(Src)) {
        // We cannot build a reg_sequence out of the same registers, they
        // must be copied. Better do it here before copyPhysReg() created
        // several reads to do the AGPR->VGPR->AGPR copy.
        CopyToVGPR = Src;
      } else {
        // First use of this AGPR source: reference it directly, preserving
        // any undef state.
        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
      }
    } else {
      assert(Def->isReg());
      Def->setIsKill(false);
      auto Src = getRegSubRegPair(*Def);

      // Direct copy from SGPR to AGPR is not possible. To avoid creation
      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
      // create a copy here and track if we already have such a copy.
      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
        CopyToVGPR = Src;
      } else {
        // Presumably a VGPR source; copy it into a fresh AGPR element.
        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
        B.addReg(Tmp);
      }
    }

    if (CopyToVGPR.Reg) {
      // Reuse (or create on first sight) the VGPR holding this source, then
      // write it into a new AGPR for this reg_sequence element.
      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
      Register &Vgpr = It->second;
      if (Inserted) {
        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
      }
      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
          .addReg(Vgpr);
      B.addReg(Tmp);
    }

    // Append the subregister index for this element of the reg_sequence.
    B.addImm(Defs[I].second);
  }
  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
  return true;
}
1568+
15611569
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
15621570
MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
15631571
Register DstReg = MI.getOperand(0).getReg();

0 commit comments

Comments
 (0)