@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
      UseMI->getOperand(0).getReg().isVirtual() &&
      !UseMI->getOperand(1).getSubReg()) {
    LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
-    unsigned Size = TII->getOpSize(*UseMI, 1);
    Register UseReg = OpToFold.getReg();
    UseMI->getOperand(1).setReg(UseReg);
    UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1021,84 +1022,9 @@ void SIFoldOperandsImpl::foldOperand(
    OpToFold.setIsKill(false);

    // Remove kill flags as kills may now be out of order with uses.
-    MRI->clearKillFlags(OpToFold.getReg());
-
-    // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
-    // can only accept VGPR or inline immediate. Recreate a reg_sequence with
-    // its initializers right here, so we will rematerialize immediates and
-    // avoid copies via different reg classes.
-    SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
-    if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-        getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      const DebugLoc &DL = UseMI->getDebugLoc();
-      MachineBasicBlock &MBB = *UseMI->getParent();
-
-      UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
-      for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
-        UseMI->removeOperand(I);
-
-      MachineInstrBuilder B(*MBB.getParent(), UseMI);
-      DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-      SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-      for (unsigned I = 0; I < Size / 4; ++I) {
-        MachineOperand *Def = Defs[I].first;
-        TargetInstrInfo::RegSubRegPair CopyToVGPR;
-        if (Def->isImm() &&
-            TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-          int64_t Imm = Def->getImm();
-
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
-          B.addReg(Tmp);
-        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-          auto Src = getRegSubRegPair(*Def);
-          Def->setIsKill(false);
-          if (!SeenAGPRs.insert(Src)) {
-            // We cannot build a reg_sequence out of the same registers, they
-            // must be copied. Better do it here before copyPhysReg() created
-            // several reads to do the AGPR->VGPR->AGPR copy.
-            CopyToVGPR = Src;
-          } else {
-            B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
-                     Src.SubReg);
-          }
-        } else {
-          assert(Def->isReg());
-          Def->setIsKill(false);
-          auto Src = getRegSubRegPair(*Def);
-
-          // Direct copy from SGPR to AGPR is not possible. To avoid creation
-          // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-          // create a copy here and track if we already have such a copy.
-          if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-            CopyToVGPR = Src;
-          } else {
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-            B.addReg(Tmp);
-          }
-        }
-
-        if (CopyToVGPR.Reg) {
-          auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-          Register &Vgpr = It->second;
-          if (Inserted) {
-            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-          }
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
-          B.addReg(Tmp);
-        }
-
-        B.addImm(Defs[I].second);
-      }
-      LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-    }
-
-    return;
+    MRI->clearKillFlags(UseReg);
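+    // If this COPY defines an AGPR, try to rewrite it in place as a
+    // REG_SEQUENCE of V_ACCVGPR_WRITE_B32_e64 initializers (see
+    // foldCopyToAGPRRegSequence below).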
+    if (foldCopyToAGPRRegSequence(UseMI))
+      return;
  }

  unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,88 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
  return true;
}

+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
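+///
+/// Illustrative MIR shape only (virtual register names are hypothetical):
+///   %seq = REG_SEQUENCE %x, %subreg.sub0, %y, %subreg.sub1
+///   %agpr = COPY %seq
+/// is rewritten so %agpr becomes a REG_SEQUENCE whose 32-bit inputs are
+/// V_ACCVGPR_WRITE_B32_e64 results (for inline immediates and values copied
+/// through a VGPR) or existing AGPRs used directly.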
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+  // initializers right here, so we will rematerialize immediates and avoid
+  // copies via different reg classes.
+  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+    return false;
+  Register UseReg = CopyMI->getOperand(1).getReg();
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+    return false;
+
+  const DebugLoc &DL = CopyMI->getDebugLoc();
+  MachineBasicBlock &MBB = *CopyMI->getParent();
+
+  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+    CopyMI->removeOperand(I);
+
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
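+  // Each Defs[I] pairs an initializer operand with the subregister index it
+  // occupies in the reg_sequence; the index is re-emitted via B.addImm below.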
+  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
+    MachineOperand *Def = Defs[I].first;
+    TargetInstrInfo::RegSubRegPair CopyToVGPR;
+    if (Def->isImm() &&
+        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      int64_t Imm = Def->getImm();
+
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addImm(Imm);
+      B.addReg(Tmp);
+    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+      auto Src = getRegSubRegPair(*Def);
+      Def->setIsKill(false);
+      if (!SeenAGPRs.insert(Src)) {
+        // We cannot build a reg_sequence out of the same registers, they
+        // must be copied. Better do it here before copyPhysReg() created
+        // several reads to do the AGPR->VGPR->AGPR copy.
+        CopyToVGPR = Src;
+      } else {
+        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+      }
+    } else {
+      assert(Def->isReg());
+      Def->setIsKill(false);
+      auto Src = getRegSubRegPair(*Def);
+
+      // Direct copy from SGPR to AGPR is not possible. To avoid creation
+      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+      // create a copy here and track if we already have such a copy.
+      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+        CopyToVGPR = Src;
+      } else {
+        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+        B.addReg(Tmp);
+      }
+    }
+
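+    // A copy through a VGPR is still pending here (duplicated AGPR source or
+    // SGPR source): materialize one VGPR copy per unique source and reuse it.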
+    if (CopyToVGPR.Reg) {
+      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+      Register &Vgpr = It->second;
+      if (Inserted) {
+        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+      }
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addReg(Vgpr);
+      B.addReg(Tmp);
+    }
+
+    B.addImm(Defs[I].second);
+  }
+  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+  return true;
+}
+
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  Register DstReg = MI.getOperand(0).getReg();