 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s
 
-define <vscale x 4 x i32> @dotp(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+define <vscale x 4 x i32> @dotp(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: dotp:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
-; CHECK-NEXT:    udot z2.s, z0.b, z1.b
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    udot z0.s, z1.b, z2.b
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %mult)
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
-define <vscale x 2 x i64> @dotp_wide(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+define <vscale x 2 x i64> @dotp_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: dotp_wide:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    udot z2.d, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    udot z0.d, z1.h, z2.h
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
   %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> zeroinitializer, <vscale x 8 x i64> %mult)
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
   ret <vscale x 2 x i64> %partial.reduce
 }
 
-define <vscale x 4 x i32> @dotp_sext(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+define <vscale x 4 x i32> @dotp_sext(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: dotp_sext:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
-; CHECK-NEXT:    sdot z2.s, z0.b, z1.b
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %mult)
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
-define <vscale x 2 x i64> @dotp_wide_sext(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+define <vscale x 2 x i64> @dotp_wide_sext(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: dotp_wide_sext:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    sdot z2.d, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
   %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> zeroinitializer, <vscale x 8 x i64> %mult)
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
   ret <vscale x 2 x i64> %partial.reduce
 }
 
-define <vscale x 4 x i32> @not_dotp(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+define <vscale x 4 x i32> @not_dotp(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
 ; CHECK-LABEL: not_dotp:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-NEXT:    and z2.h, z2.h, #0xff
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpkhi z2.s, z0.h
-; CHECK-NEXT:    uunpkhi z3.s, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    mul z2.s, z2.s, z3.s
-; CHECK-NEXT:    mad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
   %mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> zeroinitializer, <vscale x 8 x i32> %mult)
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
-define <vscale x 2 x i64> @not_dotp_wide(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
+define <vscale x 2 x i64> @not_dotp_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
 ; CHECK-LABEL: not_dotp_wide:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-NEXT:    and z2.s, z2.s, #0xffff
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z2.d, z0.s
-; CHECK-NEXT:    uunpkhi z3.d, z1.s
-; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    mul z2.d, z2.d, z3.d
-; CHECK-NEXT:    mad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    uunpklo z3.d, z1.s
+; CHECK-NEXT:    uunpklo z4.d, z2.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
   %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
   %mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> zeroinitializer, <vscale x 4 x i64> %mult)
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult)
   ret <vscale x 2 x i64> %partial.reduce
 }
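
For context, a minimal sketch (not part of the patch) of how the accumulator operand of the intrinsic is typically threaded through a reduction loop, so each iteration feeds its partial sums straight back into the next dot product instead of starting from zero. The function @dot_loop, its arguments, and the loop structure are hypothetical and omit tail handling; only the intrinsic signature matches the tests above.

; Hypothetical reduction loop: the running partial sums are carried in a phi
; and passed back as the accumulator operand of the partial-reduce intrinsic,
; which is the pattern the accumulator-aware lowering above targets.
define <vscale x 4 x i32> @dot_loop(ptr %pa, ptr %pb, i64 %n) {
entry:
  %vscale = call i64 @llvm.vscale.i64()
  %step = shl i64 %vscale, 4                 ; 16 x i8 lanes per iteration
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  %gep.a = getelementptr i8, ptr %pa, i64 %i
  %gep.b = getelementptr i8, ptr %pb, i64 %i
  %a = load <vscale x 16 x i8>, ptr %gep.a
  %b = load <vscale x 16 x i8>, ptr %gep.b
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %acc.next = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  %i.next = add nuw i64 %i, %step
  %done = icmp uge i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret <vscale x 4 x i32> %acc.next
}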