Skip to content

Commit f17b43e

Browse files
committed
[AMDGPU][NFCI] Reorder AGPRs to allow skipping over them
Inspired by llvm#70222. Move the AGPRs to the end of the register file to allow skipping over them. When building functions that don't have AGPRs, this removes almost 4000 registers out of ~9000. Brings a small (+-1%) compile-time speedup in a large IR file I tried. `getNumSupportedRegs` is still under-used, and I suspect most uses of `getNumRegs()` could be replaced to bring even more improvements in both compile time (fewer iterations) and memory usage (bitvectors/arrays can be smaller).
1 parent 6b8c194 commit f17b43e

14 files changed

+127
-101
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3365,6 +3365,24 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
33653365
return nullptr;
33663366
}
33673367

3368+
unsigned SIRegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
3369+
#ifndef NDEBUG
3370+
for (unsigned K = AMDGPU::AGPR0; K < AMDGPU::NUM_TARGET_REGS; ++K) {
3371+
// Skip hi16 registers, they're "fake" and don't have a regclass assigned.
3372+
if (K >= AMDGPU::AGPR0_HI16 && K <= AMDGPU::AGPR255_HI16)
3373+
continue;
3374+
if (!isAGPR(MF.getRegInfo(), K))
3375+
report_fatal_error("register at index " + Twine(K) + " is not an AGPR!");
3376+
}
3377+
#endif
3378+
3379+
// Don't include AGPRs on targets that don't have them.
3380+
// This cuts about 4000 registers (almost half of all registers) off.
3381+
return MF.getInfo<SIMachineFunctionInfo>()->mayUseAGPRs(MF.getFunction())
3382+
? AMDGPU::NUM_TARGET_REGS
3383+
: AMDGPU::AGPR0;
3384+
}
3385+
33683386
MCRegister SIRegisterInfo::getVCC() const {
33693387
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
33703388
}
@@ -3456,9 +3474,7 @@ MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
34563474
AMDGPU::AGPR_32RegClass } ) {
34573475
if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
34583476
return Super;
3459-
}
3460-
if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3461-
&AMDGPU::VGPR_32RegClass)) {
3477+
if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, &RC))
34623478
return Super;
34633479
}
34643480

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
342342
getConstrainedRegClassForOperand(const MachineOperand &MO,
343343
const MachineRegisterInfo &MRI) const override;
344344

345+
unsigned getNumSupportedRegs(const MachineFunction &MF) const override;
346+
345347
const TargetRegisterClass *getBoolRC() const {
346348
return isWave32 ? &AMDGPU::SReg_32RegClass
347349
: &AMDGPU::SReg_64RegClass;

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,10 +159,13 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
159159

160160
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
161161
bit isVGPR = 0, bit isAGPR = 0> {
162-
def _LO16 : SIReg<n#".l", regIdx, isVGPR, isAGPR>;
162+
def _LO16 : SIReg<n#".l", regIdx, isVGPR, isAGPR> {
163+
let PositionOrder = !if(isAGPR, 1, 0);
164+
}
163165
def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx, isVGPR, isAGPR,
164166
/* isHi16 */ 1> {
165167
let isArtificial = ArtificialHigh;
168+
let PositionOrder = !if(isAGPR, 1, 0);
166169
}
167170
def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"),
168171
!cast<Register>(NAME#"_HI16")]> {
@@ -174,6 +177,7 @@ multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
174177
let HWEncoding{9} = isAGPR;
175178

176179
int Index = !cast<int>(regIdx);
180+
let PositionOrder = !if(isAGPR, 1, 0);
177181
}
178182
}
179183

@@ -699,6 +703,8 @@ def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16,
699703
}
700704
} // End HasAGPR = 1
701705

706+
let PositionOrder = 1 in {
707+
702708
// AGPR 64-bit registers
703709
def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">;
704710

@@ -737,6 +743,7 @@ def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;
737743

738744
// AGPR 1024-bit registers
739745
def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
746+
}
740747

741748
//===----------------------------------------------------------------------===//
742749
// Register classes used as source and destination

llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir

Lines changed: 30 additions & 30 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Lines changed: 42 additions & 42 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ body: |
421421
; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
422422
;
423423
; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
424-
; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
424+
; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
425425
; GFX90A-NEXT: {{ $}}
426426
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
427427
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec

llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ body: |
3636
; CHECK-NEXT: successors: %bb.3(0x80000000)
3737
; CHECK-NEXT: {{ $}}
3838
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
39-
; CHECK-NEXT: internal [[COPY1]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16
40-
; CHECK-NEXT: internal [[COPY1]].sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31
39+
; CHECK-NEXT: internal [[COPY1]].sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16_sub29_lo16_sub29_hi16_sub30_lo16_sub30_hi16_sub31_lo16_sub31_hi16:av_1024_align2 = COPY [[COPY]].sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16_sub29_lo16_sub29_hi16_sub30_lo16_sub30_hi16_sub31_lo16_sub31_hi16
40+
; CHECK-NEXT: internal [[COPY1]].sub17_sub18_sub19:av_1024_align2 = COPY [[COPY]].sub17_sub18_sub19
4141
; CHECK-NEXT: }
4242
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:av_1024_align2 = IMPLICIT_DEF
4343
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub0

llvm/test/CodeGen/AMDGPU/frame-index.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ body: |
518518
; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
519519
;
520520
; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
521-
; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
521+
; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
522522
; GFX90A-NEXT: {{ $}}
523523
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
524524
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
@@ -732,7 +732,7 @@ body: |
732732
; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
733733
;
734734
; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
735-
; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
735+
; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
736736
; GFX90A-NEXT: {{ $}}
737737
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
738738
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec

llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
3030
;
3131
; PEI-GFX908-LABEL: name: partial_copy
3232
; PEI-GFX908: bb.0 (%ir-block.0):
33-
; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
33+
; PEI-GFX908-NEXT: liveins: $sgpr4_sgpr5, $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
3434
; PEI-GFX908-NEXT: {{ $}}
3535
; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
3636
; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
@@ -77,7 +77,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
7777
;
7878
; PEI-GFX90A-LABEL: name: partial_copy
7979
; PEI-GFX90A: bb.0 (%ir-block.0):
80-
; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
80+
; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5, $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
8181
; PEI-GFX90A-NEXT: {{ $}}
8282
; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
8383
; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11

llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ body: |
3434
liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
3535
3636
; GFX908-LABEL: name: regalloc_introduces_s_to_a_copy
37-
; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, $vgpr32_vgpr33_vgpr34_vgpr35, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
37+
; GFX908: liveins: $vgpr32_vgpr33_vgpr34_vgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
3838
; GFX908-NEXT: {{ $}}
3939
; GFX908-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr7, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
4040
; GFX908-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3

0 commit comments

Comments
 (0)