Skip to content

Commit bcae30e

Browse files
committed
GlobalISel: Use G_UADDE when narrowing G_UMULH
This greatly shrinks the AMDGPU div64 expansion. Instead of adding a zext of the condition output, add a zero and use the carry in to G_UADDE. This is closer to how the DAG expansion using umulh does it, and it seems more natural to leave the boolean output as a boolean input. We should have a combine to form G_UADDE from this pattern, but the legalizer shouldn't create extra work for the combiner if it can help it. The Mips cases are regressions, but the DAG lowering for muli128 seems to not use the expansion involving MULHU/MULHS at all. The DAG output is radically different than GlobalISel as-is, so it seems like Mips should be using a different legalization strategy here to begin with. The RISCV legalizer tests look worse for the mul i96 case, but those didn't exist when I wrote this patch and forgot about it 4 years ago, so I haven't really looked into why. We've entered the age where most tests should just be using IR, so I don't know if this matters or not (the IR mul test doesn't seem to cover i96)
1 parent 56a636f commit bcae30e

18 files changed

+10103
-12260
lines changed

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5716,6 +5716,7 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
57165716
ArrayRef<Register> Src1Regs,
57175717
ArrayRef<Register> Src2Regs,
57185718
LLT NarrowTy) {
5719+
const LLT S1 = LLT::scalar(1);
57195720
MachineIRBuilder &B = MIRBuilder;
57205721
unsigned SrcParts = Src1Regs.size();
57215722
unsigned DstParts = DstRegs.size();
@@ -5728,6 +5729,8 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
57285729
unsigned CarrySumPrevDstIdx;
57295730
SmallVector<Register, 4> Factors;
57305731

5732+
const Register Zero = B.buildConstant(NarrowTy, 0).getReg(0);
5733+
57315734
for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
57325735
// Collect low parts of muls for DstIdx.
57335736
for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
@@ -5752,15 +5755,15 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
57525755
// Add all factors and accumulate all carries into CarrySum.
57535756
if (DstIdx != DstParts - 1) {
57545757
MachineInstrBuilder Uaddo =
5755-
B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5758+
B.buildUAddo(NarrowTy, S1, Factors[0], Factors[1]);
57565759
FactorSum = Uaddo.getReg(0);
5757-
CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5760+
CarrySum = Zero;
57585761
for (unsigned i = 2; i < Factors.size(); ++i) {
5759-
MachineInstrBuilder Uaddo =
5760-
B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5761-
FactorSum = Uaddo.getReg(0);
5762-
MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5763-
CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5762+
auto Uadde =
5763+
B.buildUAdde(NarrowTy, S1, FactorSum, Factors[i], Uaddo.getReg(1));
5764+
FactorSum = Uadde.getReg(0);
5765+
CarrySum = B.buildUAdde(NarrowTy, S1, CarrySum, Zero, Uadde.getReg(1))
5766+
.getReg(0);
57645767
}
57655768
} else {
57665769
// Since value for the next index is not calculated, neither is CarrySum.

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -619,25 +619,24 @@ body: |
619619
; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
620620
; GFX6-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
621621
; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
622+
; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
622623
; GFX6-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
623624
; GFX6-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV4]]
624625
; GFX6-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV3]]
625626
; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[MUL1]], [[MUL2]]
626-
; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO1]](s1)
627-
; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[UMULH]]
628-
; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO3]](s1)
629-
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[ZEXT1]]
627+
; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UADDO]], [[UMULH]], [[UADDO1]]
628+
; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDE1]]
630629
; GFX6-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]]
631630
; GFX6-NEXT: [[MUL4:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]]
632631
; GFX6-NEXT: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]]
633632
; GFX6-NEXT: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV3]]
634633
; GFX6-NEXT: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV4]]
635-
; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]]
636-
; GFX6-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[MUL5]]
637-
; GFX6-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH1]]
638-
; GFX6-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UMULH2]]
639-
; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]]
640-
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32)
634+
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]]
635+
; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL5]]
636+
; GFX6-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[UMULH1]]
637+
; GFX6-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH2]]
638+
; GFX6-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UADDE2]]
639+
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDE]](s32), [[ADD4]](s32)
641640
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96)
642641
;
643642
; GFX89-LABEL: name: test_mul_s96

0 commit comments

Comments
 (0)