diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3a019dbaad02c..f1ba199fbae3f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
                            MachineOperand *&CurrentKnownM0Val) const;
 
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
       UseMI->getOperand(0).getReg().isVirtual() &&
       !UseMI->getOperand(1).getSubReg()) {
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
-    unsigned Size = TII->getOpSize(*UseMI, 1);
     Register UseReg = OpToFold.getReg();
     UseMI->getOperand(1).setReg(UseReg);
     UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1021,84 +1022,9 @@ void SIFoldOperandsImpl::foldOperand(
     OpToFold.setIsKill(false);
 
     // Remove kill flags as kills may now be out of order with uses.
-    MRI->clearKillFlags(OpToFold.getReg());
-
-    // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
-    // can only accept VGPR or inline immediate. Recreate a reg_sequence with
-    // its initializers right here, so we will rematerialize immediates and
-    // avoid copies via different reg classes.
-    SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
-    if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-        getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      const DebugLoc &DL = UseMI->getDebugLoc();
-      MachineBasicBlock &MBB = *UseMI->getParent();
-
-      UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
-      for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
-        UseMI->removeOperand(I);
-
-      MachineInstrBuilder B(*MBB.getParent(), UseMI);
-      DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-      SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-      for (unsigned I = 0; I < Size / 4; ++I) {
-        MachineOperand *Def = Defs[I].first;
-        TargetInstrInfo::RegSubRegPair CopyToVGPR;
-        if (Def->isImm() &&
-            TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-          int64_t Imm = Def->getImm();
-
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
-          B.addReg(Tmp);
-        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-          auto Src = getRegSubRegPair(*Def);
-          Def->setIsKill(false);
-          if (!SeenAGPRs.insert(Src)) {
-            // We cannot build a reg_sequence out of the same registers, they
-            // must be copied. Better do it here before copyPhysReg() created
-            // several reads to do the AGPR->VGPR->AGPR copy.
-            CopyToVGPR = Src;
-          } else {
-            B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
-                     Src.SubReg);
-          }
-        } else {
-          assert(Def->isReg());
-          Def->setIsKill(false);
-          auto Src = getRegSubRegPair(*Def);
-
-          // Direct copy from SGPR to AGPR is not possible. To avoid creation
-          // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-          // create a copy here and track if we already have such a copy.
-          if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-            CopyToVGPR = Src;
-          } else {
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-            B.addReg(Tmp);
-          }
-        }
-
-        if (CopyToVGPR.Reg) {
-          auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-          Register &Vgpr = It->second;
-          if (Inserted) {
-            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-          }
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
-          B.addReg(Tmp);
-        }
-
-        B.addImm(Defs[I].second);
-      }
-      LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-    }
-
-    return;
+    MRI->clearKillFlags(UseReg);
+    if (foldCopyToAGPRRegSequence(UseMI))
+      return;
   }
 
   unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,88 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   return true;
 }
 
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+  // initializers right here, so we will rematerialize immediates and avoid
+  // copies via different reg classes.
+  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+    return false;
+  Register UseReg = CopyMI->getOperand(1).getReg();
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+    return false;
+
+  const DebugLoc &DL = CopyMI->getDebugLoc();
+  MachineBasicBlock &MBB = *CopyMI->getParent();
+
+  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+    CopyMI->removeOperand(I);
+
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
+    MachineOperand *Def = Defs[I].first;
+    TargetInstrInfo::RegSubRegPair CopyToVGPR;
+    if (Def->isImm() &&
+        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      int64_t Imm = Def->getImm();
+
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addImm(Imm);
+      B.addReg(Tmp);
+    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+      auto Src = getRegSubRegPair(*Def);
+      Def->setIsKill(false);
+      if (!SeenAGPRs.insert(Src)) {
+        // We cannot build a reg_sequence out of the same registers, they
+        // must be copied. Better do it here before copyPhysReg() created
+        // several reads to do the AGPR->VGPR->AGPR copy.
+        CopyToVGPR = Src;
+      } else {
+        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+      }
+    } else {
+      assert(Def->isReg());
+      Def->setIsKill(false);
+      auto Src = getRegSubRegPair(*Def);
+
+      // Direct copy from SGPR to AGPR is not possible. To avoid creation
+      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+      // create a copy here and track if we already have such a copy.
+      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+        CopyToVGPR = Src;
+      } else {
+        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+        B.addReg(Tmp);
+      }
+    }
+
+    if (CopyToVGPR.Reg) {
+      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+      Register &Vgpr = It->second;
+      if (Inserted) {
+        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+      }
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addReg(Vgpr);
+      B.addReg(Tmp);
+    }
+
+    B.addImm(Defs[I].second);
+  }
+  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
   Register DstReg = MI.getOperand(0).getReg();
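
Note (illustrative, not part of the patch): the following is a minimal MIR-style sketch of the shape foldCopyToAGPRRegSequence rewrites, with hypothetical virtual register names and abridged operands, assuming a 64-bit AGPR destination whose first lane comes from an inline-immediate move and whose second lane is an ordinary VGPR defined earlier.

    ; before the fold (%v1 is a vgpr_32 defined earlier)
    %c0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %rs:vreg_64 = REG_SEQUENCE %c0, %subreg.sub0, %v1, %subreg.sub1
    %dst:areg_64 = COPY %rs

    ; after the fold: the COPY itself becomes the REG_SEQUENCE
    %a0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
    %a1:agpr_32 = COPY %v1
    %dst:areg_64 = REG_SEQUENCE %a0, %subreg.sub0, %a1, %subreg.sub1

The inline immediate is rematerialized directly into an AGPR lane with V_ACCVGPR_WRITE_B32_e64, and per-lane register inputs are copied lane by lane, so the wide AGPR copy never has to be expanded by copyPhysReg(). SGPR inputs are instead routed through a single cached VGPR copy plus V_ACCVGPR_WRITE_B32_e64, as handled in the function above.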