llvm · arsenm · Jul 2, 2024 · Feb 27, 2020 · jayfoad · Jul 2, 2024
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5719,6 +5719,7 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                         ArrayRef<Register> Src1Regs,
                                         ArrayRef<Register> Src2Regs,
                                         LLT NarrowTy) {
+  const LLT S1 = LLT::scalar(1);
   MachineIRBuilder &B = MIRBuilder;
   unsigned SrcParts = Src1Regs.size();
   unsigned DstParts = DstRegs.size();
@@ -5731,6 +5732,8 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
   unsigned CarrySumPrevDstIdx;
   SmallVector<Register, 4> Factors;
 
+  const Register Zero = B.buildConstant(NarrowTy, 0).getReg(0);
+
   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
     // Collect low parts of muls for DstIdx.
     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
@@ -5755,15 +5758,15 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
     // Add all factors and accumulate all carries into CarrySum.
     if (DstIdx != DstParts - 1) {
       MachineInstrBuilder Uaddo =
-          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
+          B.buildUAddo(NarrowTy, S1, Factors[0], Factors[1]);
       FactorSum = Uaddo.getReg(0);
-      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
+      CarrySum = Zero;
       for (unsigned i = 2; i < Factors.size(); ++i) {
-        MachineInstrBuilder Uaddo =
-            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
-        FactorSum = Uaddo.getReg(0);
-        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
-        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
+        auto Uadde =
+            B.buildUAdde(NarrowTy, S1, FactorSum, Factors[i], Uaddo.getReg(1));
+        FactorSum = Uadde.getReg(0);
+        CarrySum = B.buildUAdde(NarrowTy, S1, CarrySum, Zero, Uadde.getReg(1))
+                       .getReg(0);
       }
     } else {
       // Since value for the next index is not calculated, neither is CarrySum.

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/mul.ll b/llvm/test/CodeGen/AArch64/GlobalISel/mul.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define i8 @mul_i8(i8 %x, i8 %y) { ; i8 multiply: the checks expect one `mul` on w registers.
+; CHECK-LABEL: mul_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul w0, w0, w1
+; CHECK-NEXT:    ret
+  %mul = mul i8 %x, %y
+  ret i8 %mul
+}
+
+define i16 @mul_i16(i16 %x, i16 %y) { ; i16 multiply: the checks expect one `mul` on w registers.
+; CHECK-LABEL: mul_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul w0, w0, w1
+; CHECK-NEXT:    ret
+  %mul = mul i16 %x, %y
+  ret i16 %mul
+}
+
+define i32 @mul_i32(i32 %x, i32 %y) { ; i32 multiply: the checks expect one `mul` on w registers.
+; CHECK-LABEL: mul_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul w0, w0, w1
+; CHECK-NEXT:    ret
+  %mul = mul i32 %x, %y
+  ret i32 %mul
+}
+
+define i64 @mul_i64(i64 %x, i64 %y) { ; i64 multiply: the checks expect one `mul` on x registers.
+; CHECK-LABEL: mul_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x0, x0, x1
+; CHECK-NEXT:    ret
+  %mul = mul i64 %x, %y
+  ret i64 %mul
+}
+
+define i96 @mul_i96(i96 %x, i96 %y) { ; i96 multiply: the expected sequence is the same as for mul_i128.
+; CHECK-LABEL: mul_i96:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x9, x0, x3
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:    umulh x10, x0, x2
+; CHECK-NEXT:    madd x9, x1, x2, x9
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    add x1, x9, x10
+; CHECK-NEXT:    ret
+  %mul = mul i96 %x, %y
+  ret i96 %mul
+}
+
+define i128 @mul_i128(i128 %x, i128 %y) { ; i128 multiply: low half from `mul`, high half from umulh + madd + add; no flag-setting adds expected.
+; CHECK-LABEL: mul_i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x9, x0, x3
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:    umulh x10, x0, x2
+; CHECK-NEXT:    madd x9, x1, x2, x9
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    add x1, x9, x10
+; CHECK-NEXT:    ret
+  %mul = mul i128 %x, %y
+  ret i128 %mul
+}
+
+define i160 @mul_i160(i160 %x, i160 %y) { ; i160 multiply: the expected sequence is the same as for mul_i192.
+; CHECK-LABEL: mul_i160:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x8, x1, x4
+; CHECK-NEXT:    mul x9, x0, x5
+; CHECK-NEXT:    umulh x10, x0, x4
+; CHECK-NEXT:    mul x11, x2, x4
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    mul x12, x1, x5
+; CHECK-NEXT:    mul x13, x0, x6
+; CHECK-NEXT:    umulh x14, x1, x4
+; CHECK-NEXT:    adcs x1, x8, x10
+; CHECK-NEXT:    adc x9, xzr, xzr
+; CHECK-NEXT:    adds x10, x11, x12
+; CHECK-NEXT:    umulh x8, x0, x5
+; CHECK-NEXT:    cset w11, hs
+; CHECK-NEXT:    adc x10, x10, x13
+; CHECK-NEXT:    cmp w11, #1
+; CHECK-NEXT:    mul x0, x0, x4
+; CHECK-NEXT:    adc x10, x10, x14
+; CHECK-NEXT:    adc x8, x10, x8
+; CHECK-NEXT:    adc x2, x8, x9
+; CHECK-NEXT:    ret
+  %mul = mul i160 %x, %y ; NOTE(review): above, `adcs x1, x8, x10` adds the carry of `adds x8, x8, x9` into the same result register x1 - looks like a same-limb carry; verify against a reference multiply.
+  ret i160 %mul
+}
+
+define i192 @mul_i192(i192 %x, i192 %y) { ; i192 multiply: three 64-bit result registers (x0, x1, x2) are written.
+; CHECK-LABEL: mul_i192:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x8, x1, x4
+; CHECK-NEXT:    mul x9, x0, x5
+; CHECK-NEXT:    umulh x10, x0, x4
+; CHECK-NEXT:    mul x11, x2, x4
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    mul x12, x1, x5
+; CHECK-NEXT:    mul x13, x0, x6
+; CHECK-NEXT:    umulh x14, x1, x4
+; CHECK-NEXT:    adcs x1, x8, x10
+; CHECK-NEXT:    adc x9, xzr, xzr
+; CHECK-NEXT:    adds x10, x11, x12
+; CHECK-NEXT:    umulh x8, x0, x5
+; CHECK-NEXT:    cset w11, hs
+; CHECK-NEXT:    adc x10, x10, x13
+; CHECK-NEXT:    cmp w11, #1
+; CHECK-NEXT:    mul x0, x0, x4
+; CHECK-NEXT:    adc x10, x10, x14
+; CHECK-NEXT:    adc x8, x10, x8
+; CHECK-NEXT:    adc x2, x8, x9
+; CHECK-NEXT:    ret
+  %mul = mul i192 %x, %y ; NOTE(review): above, `adcs x1, x8, x10` adds the carry of `adds x8, x8, x9` into the same result register x1 - looks like a same-limb carry; verify against a reference multiply.
+  ret i192 %mul
+}
+
+define i224 @mul_i224(i224 %x, i224 %y) { ; i224 multiply: the expected sequence appears identical to mul_i256.
+; CHECK-LABEL: mul_i224:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x8, x1, x4
+; CHECK-NEXT:    mul x9, x0, x5
+; CHECK-NEXT:    umulh x10, x0, x4
+; CHECK-NEXT:    mul x11, x2, x4
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    mul x12, x1, x5
+; CHECK-NEXT:    adcs x8, x8, x10
+; CHECK-NEXT:    mul x14, x2, x5
+; CHECK-NEXT:    adc x10, xzr, xzr
+; CHECK-NEXT:    mul x13, x0, x6
+; CHECK-NEXT:    adds x11, x11, x12
+; CHECK-NEXT:    umulh x15, x1, x4
+; CHECK-NEXT:    madd x14, x3, x4, x14
+; CHECK-NEXT:    umulh x16, x0, x5
+; CHECK-NEXT:    madd x12, x1, x6, x14
+; CHECK-NEXT:    cset w14, hs
+; CHECK-NEXT:    adcs x11, x11, x13
+; CHECK-NEXT:    adc x13, xzr, xzr
+; CHECK-NEXT:    cmp w14, #1
+; CHECK-NEXT:    umulh x17, x2, x4
+; CHECK-NEXT:    adcs x11, x11, x15
+; CHECK-NEXT:    adc x13, x13, xzr
+; CHECK-NEXT:    cmp w14, #1
+; CHECK-NEXT:    umulh x9, x1, x5
+; CHECK-NEXT:    adcs x11, x11, x16
+; CHECK-NEXT:    mov x1, x8
+; CHECK-NEXT:    adc x13, x13, xzr
+; CHECK-NEXT:    cmp w14, #1
+; CHECK-NEXT:    umulh x18, x0, x6
+; CHECK-NEXT:    adcs x2, x11, x10
+; CHECK-NEXT:    adc x10, x13, xzr
+; CHECK-NEXT:    madd x12, x0, x7, x12
+; CHECK-NEXT:    add x9, x17, x9
+; CHECK-NEXT:    mul x0, x0, x4
+; CHECK-NEXT:    add x9, x9, x18
+; CHECK-NEXT:    add x9, x9, x10
+; CHECK-NEXT:    add x3, x12, x9
+; CHECK-NEXT:    ret
+  %mul = mul i224 %x, %y ; NOTE(review): above, the carry saved by `cset w14, hs` is re-applied with `cmp w14, #1` before three separate `adcs` into x11/x2 - confirm the one carry is meant to be added repeatedly to the same limb.
+  ret i224 %mul
+}
+
+define i256 @mul_i256(i256 %x, i256 %y) { ; i256 multiply: four 64-bit result registers (x0, x1, x2, x3) are written.
+; CHECK-LABEL: mul_i256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x8, x1, x4
+; CHECK-NEXT:    mul x9, x0, x5
+; CHECK-NEXT:    umulh x10, x0, x4
+; CHECK-NEXT:    mul x11, x2, x4
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    mul x12, x1, x5
+; CHECK-NEXT:    adcs x8, x8, x10
+; CHECK-NEXT:    mul x14, x2, x5
+; CHECK-NEXT:    adc x10, xzr, xzr
+; CHECK-NEXT:    mul x13, x0, x6
+; CHECK-NEXT:    adds x11, x11, x12
+; CHECK-NEXT:    umulh x15, x1, x4
+; CHECK-NEXT:    madd x14, x3, x4, x14
+; CHECK-NEXT:    umulh x16, x0, x5
+; CHECK-NEXT:    madd x12, x1, x6, x14
+; CHECK-NEXT:    cset w14, hs
+; CHECK-NEXT:    adcs x11, x11, x13
+; CHECK-NEXT:    adc x13, xzr, xzr
+; CHECK-NEXT:    cmp w14, #1
+; CHECK-NEXT:    umulh x17, x2, x4
+; CHECK-NEXT:    adcs x11, x11, x15
+; CHECK-NEXT:    adc x13, x13, xzr
+; CHECK-NEXT:    cmp w14, #1
+; CHECK-NEXT:    umulh x9, x1, x5
+; CHECK-NEXT:    adcs x11, x11, x16
+; CHECK-NEXT:    mov x1, x8
+; CHECK-NEXT:    adc x13, x13, xzr
+; CHECK-NEXT:    cmp w14, #1
+; CHECK-NEXT:    umulh x18, x0, x6
+; CHECK-NEXT:    adcs x2, x11, x10
+; CHECK-NEXT:    adc x10, x13, xzr
+; CHECK-NEXT:    madd x12, x0, x7, x12
+; CHECK-NEXT:    add x9, x17, x9
+; CHECK-NEXT:    mul x0, x0, x4
+; CHECK-NEXT:    add x9, x9, x18
+; CHECK-NEXT:    add x9, x9, x10
+; CHECK-NEXT:    add x3, x12, x9
+; CHECK-NEXT:    ret
+  %mul = mul i256 %x, %y ; NOTE(review): above, the carry saved by `cset w14, hs` is re-applied with `cmp w14, #1` before three separate `adcs` into x11/x2 - confirm the one carry is meant to be added repeatedly to the same limb.
+  ret i256 %mul
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
@@ -619,25 +619,24 @@ body: |
     ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
     ; GFX6-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
     ; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
+    ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
     ; GFX6-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV4]]
     ; GFX6-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV3]]
     ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[MUL1]], [[MUL2]]
-    ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO1]](s1)
-    ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[UMULH]]
-    ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO3]](s1)
-    ; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[ZEXT1]]
+    ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UADDO]], [[UMULH]], [[UADDO1]]
+    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDE1]]
     ; GFX6-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]]
     ; GFX6-NEXT: [[MUL4:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]]
     ; GFX6-NEXT: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]]
     ; GFX6-NEXT: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV3]]
     ; GFX6-NEXT: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV4]]
-    ; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]]
-    ; GFX6-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[MUL5]]
-    ; GFX6-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH1]]
-    ; GFX6-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UMULH2]]
-    ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]]
-    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32)
+    ; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]]
+    ; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL5]]
+    ; GFX6-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[UMULH1]]
+    ; GFX6-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH2]]
+    ; GFX6-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UADDE2]]
+    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDE]](s32), [[ADD4]](s32)
     ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96)
     ;
     ; GFX89-LABEL: name: test_mul_s96