Skip to content

Commit 2605280

Browse files
committed
GlobalISel: Use G_UADDE when narrowing G_UMULH
This greatly shrinks the AMDGPU div64 expansion. Instead of adding a zext of the condition output, add a zero and use the carry-in to G_UADDE. This is closer to how the DAG expansion using umulh does it, and it seems more natural to leave the boolean output as a boolean input. We should have a combine to form G_UADDE from this pattern, but the legalizer shouldn't create extra work for the combiner if it can help it. The Mips cases are regressions, but the DAG lowering for mul i128 seems not to use the expansion involving MULHU/MULHS at all. The DAG output is radically different from GlobalISel as-is, so it seems like Mips should be using a different legalization strategy here to begin with. The RISCV legalizer tests look worse for the mul i96 case, but those tests didn't exist when I wrote this patch 4 years ago (and then forgot about it), so I haven't really looked into why. We've entered the age where most tests should just be using IR, so I don't know if this matters or not (the IR mul test doesn't seem to cover i96).
1 parent e10d4e8 commit 2605280

File tree

19 files changed

+10181
-12360
lines changed

19 files changed

+10181
-12360
lines changed

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5719,6 +5719,7 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
57195719
ArrayRef<Register> Src1Regs,
57205720
ArrayRef<Register> Src2Regs,
57215721
LLT NarrowTy) {
5722+
const LLT S1 = LLT::scalar(1);
57225723
MachineIRBuilder &B = MIRBuilder;
57235724
unsigned SrcParts = Src1Regs.size();
57245725
unsigned DstParts = DstRegs.size();
@@ -5731,6 +5732,8 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
57315732
unsigned CarrySumPrevDstIdx;
57325733
SmallVector<Register, 4> Factors;
57335734

5735+
const Register Zero = B.buildConstant(NarrowTy, 0).getReg(0);
5736+
57345737
for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
57355738
// Collect low parts of muls for DstIdx.
57365739
for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
@@ -5755,15 +5758,15 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
57555758
// Add all factors and accumulate all carries into CarrySum.
57565759
if (DstIdx != DstParts - 1) {
57575760
MachineInstrBuilder Uaddo =
5758-
B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5761+
B.buildUAddo(NarrowTy, S1, Factors[0], Factors[1]);
57595762
FactorSum = Uaddo.getReg(0);
5760-
CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5763+
CarrySum = Zero;
57615764
for (unsigned i = 2; i < Factors.size(); ++i) {
5762-
MachineInstrBuilder Uaddo =
5763-
B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5764-
FactorSum = Uaddo.getReg(0);
5765-
MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5766-
CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5765+
auto Uadde =
5766+
B.buildUAdde(NarrowTy, S1, FactorSum, Factors[i], Uaddo.getReg(1));
5767+
FactorSum = Uadde.getReg(0);
5768+
CarrySum = B.buildUAdde(NarrowTy, S1, CarrySum, Zero, Uadde.getReg(1))
5769+
.getReg(0);
57675770
}
57685771
} else {
57695772
// Since value for the next index is not calculated, neither is CarrySum.

llvm/test/CodeGen/AArch64/GlobalISel/mul.ll

Lines changed: 78 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -68,26 +68,25 @@ define i128 @mul_i128(i128 %x, i128 %y) {
6868
define i160 @mul_i160(i160 %x, i160 %y) {
6969
; CHECK-LABEL: mul_i160:
7070
; CHECK: // %bb.0:
71-
; CHECK-NEXT: mul x9, x1, x4
72-
; CHECK-NEXT: mul x10, x1, x5
73-
; CHECK-NEXT: mul x11, x0, x5
74-
; CHECK-NEXT: umulh x12, x0, x4
75-
; CHECK-NEXT: madd x10, x2, x4, x10
76-
; CHECK-NEXT: adds x9, x9, x11
77-
; CHECK-NEXT: umulh x13, x1, x4
71+
; CHECK-NEXT: mul x8, x1, x4
72+
; CHECK-NEXT: mul x9, x0, x5
73+
; CHECK-NEXT: umulh x10, x0, x4
74+
; CHECK-NEXT: mul x11, x2, x4
75+
; CHECK-NEXT: adds x8, x8, x9
76+
; CHECK-NEXT: mul x12, x1, x5
77+
; CHECK-NEXT: mul x13, x0, x6
78+
; CHECK-NEXT: umulh x14, x1, x4
79+
; CHECK-NEXT: adcs x1, x8, x10
80+
; CHECK-NEXT: adc x9, xzr, xzr
81+
; CHECK-NEXT: adds x10, x11, x12
82+
; CHECK-NEXT: umulh x8, x0, x5
7883
; CHECK-NEXT: cset w11, hs
79-
; CHECK-NEXT: adds x1, x9, x12
80-
; CHECK-NEXT: and x11, x11, #0x1
81-
; CHECK-NEXT: umulh x14, x0, x5
82-
; CHECK-NEXT: mul x8, x0, x4
83-
; CHECK-NEXT: madd x9, x0, x6, x10
84-
; CHECK-NEXT: cset w10, hs
85-
; CHECK-NEXT: and x10, x10, #0x1
86-
; CHECK-NEXT: add x10, x11, x10
87-
; CHECK-NEXT: add x11, x13, x14
88-
; CHECK-NEXT: add x10, x11, x10
89-
; CHECK-NEXT: mov x0, x8
90-
; CHECK-NEXT: add x2, x9, x10
84+
; CHECK-NEXT: adc x10, x10, x13
85+
; CHECK-NEXT: cmp w11, #1
86+
; CHECK-NEXT: mul x0, x0, x4
87+
; CHECK-NEXT: adc x10, x10, x14
88+
; CHECK-NEXT: adc x8, x10, x8
89+
; CHECK-NEXT: adc x2, x8, x9
9190
; CHECK-NEXT: ret
9291
%mul = mul i160 %x, %y
9392
ret i160 %mul
@@ -96,26 +95,25 @@ define i160 @mul_i160(i160 %x, i160 %y) {
9695
define i192 @mul_i192(i192 %x, i192 %y) {
9796
; CHECK-LABEL: mul_i192:
9897
; CHECK: // %bb.0:
99-
; CHECK-NEXT: mul x9, x1, x4
100-
; CHECK-NEXT: mul x10, x1, x5
101-
; CHECK-NEXT: mul x11, x0, x5
102-
; CHECK-NEXT: umulh x12, x0, x4
103-
; CHECK-NEXT: madd x10, x2, x4, x10
104-
; CHECK-NEXT: adds x9, x9, x11
105-
; CHECK-NEXT: umulh x13, x1, x4
98+
; CHECK-NEXT: mul x8, x1, x4
99+
; CHECK-NEXT: mul x9, x0, x5
100+
; CHECK-NEXT: umulh x10, x0, x4
101+
; CHECK-NEXT: mul x11, x2, x4
102+
; CHECK-NEXT: adds x8, x8, x9
103+
; CHECK-NEXT: mul x12, x1, x5
104+
; CHECK-NEXT: mul x13, x0, x6
105+
; CHECK-NEXT: umulh x14, x1, x4
106+
; CHECK-NEXT: adcs x1, x8, x10
107+
; CHECK-NEXT: adc x9, xzr, xzr
108+
; CHECK-NEXT: adds x10, x11, x12
109+
; CHECK-NEXT: umulh x8, x0, x5
106110
; CHECK-NEXT: cset w11, hs
107-
; CHECK-NEXT: adds x1, x9, x12
108-
; CHECK-NEXT: and x11, x11, #0x1
109-
; CHECK-NEXT: umulh x14, x0, x5
110-
; CHECK-NEXT: mul x8, x0, x4
111-
; CHECK-NEXT: madd x9, x0, x6, x10
112-
; CHECK-NEXT: cset w10, hs
113-
; CHECK-NEXT: and x10, x10, #0x1
114-
; CHECK-NEXT: add x10, x11, x10
115-
; CHECK-NEXT: add x11, x13, x14
116-
; CHECK-NEXT: add x10, x11, x10
117-
; CHECK-NEXT: mov x0, x8
118-
; CHECK-NEXT: add x2, x9, x10
111+
; CHECK-NEXT: adc x10, x10, x13
112+
; CHECK-NEXT: cmp w11, #1
113+
; CHECK-NEXT: mul x0, x0, x4
114+
; CHECK-NEXT: adc x10, x10, x14
115+
; CHECK-NEXT: adc x8, x10, x8
116+
; CHECK-NEXT: adc x2, x8, x9
119117
; CHECK-NEXT: ret
120118
%mul = mul i192 %x, %y
121119
ret i192 %mul
@@ -127,50 +125,40 @@ define i224 @mul_i224(i224 %x, i224 %y) {
127125
; CHECK-NEXT: mul x8, x1, x4
128126
; CHECK-NEXT: mul x9, x0, x5
129127
; CHECK-NEXT: umulh x10, x0, x4
130-
; CHECK-NEXT: mul x14, x2, x5
131-
; CHECK-NEXT: adds x8, x8, x9
132128
; CHECK-NEXT: mul x11, x2, x4
133-
; CHECK-NEXT: cset w9, hs
134-
; CHECK-NEXT: adds x8, x8, x10
135-
; CHECK-NEXT: and x9, x9, #0x1
129+
; CHECK-NEXT: adds x8, x8, x9
136130
; CHECK-NEXT: mul x12, x1, x5
137-
; CHECK-NEXT: cset w10, hs
138-
; CHECK-NEXT: and x10, x10, #0x1
131+
; CHECK-NEXT: adcs x8, x8, x10
132+
; CHECK-NEXT: mul x14, x2, x5
133+
; CHECK-NEXT: adc x10, xzr, xzr
139134
; CHECK-NEXT: mul x13, x0, x6
140-
; CHECK-NEXT: add x9, x9, x10
141-
; CHECK-NEXT: umulh x15, x1, x4
142135
; CHECK-NEXT: adds x11, x11, x12
136+
; CHECK-NEXT: umulh x15, x1, x4
143137
; CHECK-NEXT: madd x14, x3, x4, x14
144138
; CHECK-NEXT: umulh x16, x0, x5
145139
; CHECK-NEXT: madd x12, x1, x6, x14
146140
; CHECK-NEXT: cset w14, hs
147-
; CHECK-NEXT: adds x11, x11, x13
148-
; CHECK-NEXT: cset w13, hs
149-
; CHECK-NEXT: adds x11, x11, x15
150-
; CHECK-NEXT: and x14, x14, #0x1
141+
; CHECK-NEXT: adcs x11, x11, x13
142+
; CHECK-NEXT: adc x13, xzr, xzr
143+
; CHECK-NEXT: cmp w14, #1
151144
; CHECK-NEXT: umulh x17, x2, x4
152-
; CHECK-NEXT: and x13, x13, #0x1
153-
; CHECK-NEXT: add x13, x14, x13
154-
; CHECK-NEXT: umulh x18, x1, x5
155-
; CHECK-NEXT: cset w1, hs
156-
; CHECK-NEXT: adds x11, x11, x16
157-
; CHECK-NEXT: and x14, x1, #0x1
145+
; CHECK-NEXT: adcs x11, x11, x15
146+
; CHECK-NEXT: adc x13, x13, xzr
147+
; CHECK-NEXT: cmp w14, #1
148+
; CHECK-NEXT: umulh x9, x1, x5
149+
; CHECK-NEXT: adcs x11, x11, x16
158150
; CHECK-NEXT: mov x1, x8
159-
; CHECK-NEXT: umulh x15, x0, x6
160-
; CHECK-NEXT: add x10, x13, x14
161-
; CHECK-NEXT: cset w13, hs
162-
; CHECK-NEXT: adds x2, x11, x9
163-
; CHECK-NEXT: and x9, x13, #0x1
151+
; CHECK-NEXT: adc x13, x13, xzr
152+
; CHECK-NEXT: cmp w14, #1
153+
; CHECK-NEXT: umulh x18, x0, x6
154+
; CHECK-NEXT: adcs x2, x11, x10
155+
; CHECK-NEXT: adc x10, x13, xzr
164156
; CHECK-NEXT: madd x12, x0, x7, x12
165-
; CHECK-NEXT: cset w11, hs
166-
; CHECK-NEXT: add x9, x10, x9
167-
; CHECK-NEXT: add x13, x17, x18
168-
; CHECK-NEXT: and x10, x11, #0x1
157+
; CHECK-NEXT: add x9, x17, x9
169158
; CHECK-NEXT: mul x0, x0, x4
159+
; CHECK-NEXT: add x9, x9, x18
170160
; CHECK-NEXT: add x9, x9, x10
171-
; CHECK-NEXT: add x11, x13, x15
172-
; CHECK-NEXT: add x10, x12, x11
173-
; CHECK-NEXT: add x3, x10, x9
161+
; CHECK-NEXT: add x3, x12, x9
174162
; CHECK-NEXT: ret
175163
%mul = mul i224 %x, %y
176164
ret i224 %mul
@@ -182,50 +170,40 @@ define i256 @mul_i256(i256 %x, i256 %y) {
182170
; CHECK-NEXT: mul x8, x1, x4
183171
; CHECK-NEXT: mul x9, x0, x5
184172
; CHECK-NEXT: umulh x10, x0, x4
185-
; CHECK-NEXT: mul x14, x2, x5
186-
; CHECK-NEXT: adds x8, x8, x9
187173
; CHECK-NEXT: mul x11, x2, x4
188-
; CHECK-NEXT: cset w9, hs
189-
; CHECK-NEXT: adds x8, x8, x10
190-
; CHECK-NEXT: and x9, x9, #0x1
174+
; CHECK-NEXT: adds x8, x8, x9
191175
; CHECK-NEXT: mul x12, x1, x5
192-
; CHECK-NEXT: cset w10, hs
193-
; CHECK-NEXT: and x10, x10, #0x1
176+
; CHECK-NEXT: adcs x8, x8, x10
177+
; CHECK-NEXT: mul x14, x2, x5
178+
; CHECK-NEXT: adc x10, xzr, xzr
194179
; CHECK-NEXT: mul x13, x0, x6
195-
; CHECK-NEXT: add x9, x9, x10
196-
; CHECK-NEXT: umulh x15, x1, x4
197180
; CHECK-NEXT: adds x11, x11, x12
181+
; CHECK-NEXT: umulh x15, x1, x4
198182
; CHECK-NEXT: madd x14, x3, x4, x14
199183
; CHECK-NEXT: umulh x16, x0, x5
200184
; CHECK-NEXT: madd x12, x1, x6, x14
201185
; CHECK-NEXT: cset w14, hs
202-
; CHECK-NEXT: adds x11, x11, x13
203-
; CHECK-NEXT: cset w13, hs
204-
; CHECK-NEXT: adds x11, x11, x15
205-
; CHECK-NEXT: and x14, x14, #0x1
186+
; CHECK-NEXT: adcs x11, x11, x13
187+
; CHECK-NEXT: adc x13, xzr, xzr
188+
; CHECK-NEXT: cmp w14, #1
206189
; CHECK-NEXT: umulh x17, x2, x4
207-
; CHECK-NEXT: and x13, x13, #0x1
208-
; CHECK-NEXT: add x13, x14, x13
209-
; CHECK-NEXT: umulh x18, x1, x5
210-
; CHECK-NEXT: cset w1, hs
211-
; CHECK-NEXT: adds x11, x11, x16
212-
; CHECK-NEXT: and x14, x1, #0x1
190+
; CHECK-NEXT: adcs x11, x11, x15
191+
; CHECK-NEXT: adc x13, x13, xzr
192+
; CHECK-NEXT: cmp w14, #1
193+
; CHECK-NEXT: umulh x9, x1, x5
194+
; CHECK-NEXT: adcs x11, x11, x16
213195
; CHECK-NEXT: mov x1, x8
214-
; CHECK-NEXT: umulh x15, x0, x6
215-
; CHECK-NEXT: add x10, x13, x14
216-
; CHECK-NEXT: cset w13, hs
217-
; CHECK-NEXT: adds x2, x11, x9
218-
; CHECK-NEXT: and x9, x13, #0x1
196+
; CHECK-NEXT: adc x13, x13, xzr
197+
; CHECK-NEXT: cmp w14, #1
198+
; CHECK-NEXT: umulh x18, x0, x6
199+
; CHECK-NEXT: adcs x2, x11, x10
200+
; CHECK-NEXT: adc x10, x13, xzr
219201
; CHECK-NEXT: madd x12, x0, x7, x12
220-
; CHECK-NEXT: cset w11, hs
221-
; CHECK-NEXT: add x9, x10, x9
222-
; CHECK-NEXT: add x13, x17, x18
223-
; CHECK-NEXT: and x10, x11, #0x1
202+
; CHECK-NEXT: add x9, x17, x9
224203
; CHECK-NEXT: mul x0, x0, x4
204+
; CHECK-NEXT: add x9, x9, x18
225205
; CHECK-NEXT: add x9, x9, x10
226-
; CHECK-NEXT: add x11, x13, x15
227-
; CHECK-NEXT: add x10, x12, x11
228-
; CHECK-NEXT: add x3, x10, x9
206+
; CHECK-NEXT: add x3, x12, x9
229207
; CHECK-NEXT: ret
230208
%mul = mul i256 %x, %y
231209
ret i256 %mul

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -619,25 +619,24 @@ body: |
619619
; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
620620
; GFX6-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
621621
; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
622+
; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
622623
; GFX6-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
623624
; GFX6-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV4]]
624625
; GFX6-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV3]]
625626
; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[MUL1]], [[MUL2]]
626-
; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO1]](s1)
627-
; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[UMULH]]
628-
; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO3]](s1)
629-
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[ZEXT1]]
627+
; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UADDO]], [[UMULH]], [[UADDO1]]
628+
; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDE1]]
630629
; GFX6-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]]
631630
; GFX6-NEXT: [[MUL4:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]]
632631
; GFX6-NEXT: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]]
633632
; GFX6-NEXT: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV3]]
634633
; GFX6-NEXT: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV4]]
635-
; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]]
636-
; GFX6-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[MUL5]]
637-
; GFX6-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH1]]
638-
; GFX6-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UMULH2]]
639-
; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]]
640-
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32)
634+
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]]
635+
; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL5]]
636+
; GFX6-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[UMULH1]]
637+
; GFX6-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH2]]
638+
; GFX6-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UADDE2]]
639+
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDE]](s32), [[ADD4]](s32)
641640
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96)
642641
;
643642
; GFX89-LABEL: name: test_mul_s96

0 commit comments

Comments
 (0)