// REQUIRES: arm-emulator

// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   --convert-vector-to-scf --convert-scf-to-cf --convert-vector-to-llvm='enable-arm-neon enable-arm-i8mm' \
// DEFINE:   --expand-strided-metadata --convert-to-llvm --finalize-memref-to-llvm \
// DEFINE:   --lower-affine --convert-arith-to-llvm --reconcile-unrealized-casts \
// DEFINE: -o %t

// DEFINE: %{entry_point} = main

// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+neon,+i8mm" \
// DEFINE:   -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils

// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
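//
// Note on FileCheck prefixes: `CHECK-IR` lines are matched against the
// compiled IR written to %t, while plain `CHECK` lines are matched against
// the output printed at runtime by %{run}.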

#packed_maps = [
  affine_map<(m, n, k) -> (m, k)>,
  affine_map<(m, n, k) -> (n, k)>,
  affine_map<(m, n, k) -> (m, n)>
]
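
// With these indexing maps the contraction computes
//   OUT[m][n] = ACC[m][n] + sum_k LHS[m][k] * RHS[n][k],
// i.e. the RHS is indexed as (n, k) and is therefore read as if transposed.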

//
// Test the lowering of `vector.contract` using the `LowerContractionToSMMLAPattern`
//
// The operation that the `vector.contract` in this test performs is matrix
// multiplication with accumulate
//   OUT = ACC + LHS * RHS
// of two 8-bit integer matrices LHS and RHS, and a 32-bit integer matrix ACC
// into a 32-bit integer matrix OUT. The LHS and RHS can each be sign- or
// zero-extended; this test covers all the possible variants.
//
// Tested are the computed values as well as that the relevant `ArmNeon` dialect
// operations (`arm_neon.smmla`, `arm_neon.ummla`, etc.) are emitted.
//
// The pattern above handles (therefore this test prepares) input/output vectors with
// specific shapes:
//   * LHS:      vector<MxKxi8>
//   * RHS:      vector<NxKxi8>
//   * ACC, OUT: vector<MxNxi32>
// where M and N are even and K is divisible by 8.
// Note that the RHS is transposed.
// This data layout makes it efficient to load data into SIMD
// registers in the layout expected by FEAT_I8MM instructions.
// Such a `vector.contract` is representative of the code we aim to generate
// by vectorisation of `linalg.mmt4d`.
//
// In this specific test we use M == 4, N == 4, and K == 16.
//
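// As a rough illustration of what the pattern emits (not exercised directly
// here), a single 2x2 output tile and one 8-wide slice of K map to one MMLA
// intrinsic, along the lines of:
//
//   %tile = arm_neon.intr.smmla %acc2x2, %lhs2x8, %rhs2x8
//             : vector<16xi8> to vector<4xi32>
//
// where the 2x8 LHS/RHS sub-blocks and the 2x2 ACC sub-block are flattened
// to 1-D vectors (%acc2x2, %lhs2x8 and %rhs2x8 are hypothetical names used
// only in this sketch).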

// Test the operation where both LHS and RHS are interpreted as signed, hence
// we ultimately emit and execute the `smmla` instruction.

// CHECK-IR-LABEL: llvm.func @test_smmla
// CHECK-IR-COUNT-4: arm_neon.intr.smmla
func.func @test_smmla() {

  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8

  // Accumulator test data
  %acc_cst = arith.constant dense<[[ -1,  -9,  -4,   0],
                                   [  6,   5,   7,   2],
                                   [ -8,  -7,   9, -10],
                                   [  9,   4,  -4,   0]]> : vector<4x4xi32>

  %acc_mem = memref.alloca() : memref<4x4xi32>
  vector.transfer_write %acc_cst, %acc_mem[%c0, %c0] : vector<4x4xi32>, memref<4x4xi32>
  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x4xi32>, vector<4x4xi32>

  // LHS test data
  %lhs_cst = arith.constant dense<[[ -4,  -4,  -4,  -6,   0,   1,   6,   2,  -1,   4,   5,  -8,   9,   5,   4,   9],
                                   [ -1,   6,   0,   7,  -7,   8,   5,   8,  -7,   6,  -2,   1,   1,   5,  -4,  -4],
                                   [  4, -10,  10,  -3,   5,   3,   2,   3,  -7,   9,  -9, -10,   7,  -8,  -5,  -2],
                                   [  9,   5,   8,   9,   6,  -3,  -9,   7,  -4,  -7,  -2,   7,  -8,   2,   8,   7]]> : vector<4x16xi8>

  %lhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %lhs_cst, %lhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // RHS test data
  %rhs_cst = arith.constant dense<[[  1,   2,  -3,   5,  10,   8,  10,  -2,   1,  10,  -5,   2,   4,   3,  -9,   4],
                                   [ -3,  -3,  -3,   4,   6,  -1,   0,  -5,   6,   3,  -1,   9,  -3,   3,  -2,   4],
                                   [  1,   9,  -1,   1,  -5,   4,   9, -10,  -1,  -7,  10,  -2,   0,  -3,   4,   7],
                                   [ -4, -10,   8, -10,  -5,  -8,  -6,   7,   4,  -2,  10,   3,  -9,   5,   2,  -1]]> : vector<4x16xi8>

  %rhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %rhs_cst, %rhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %rhs = vector.transfer_read %rhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // Matrix multiplication and accumulate with transposed RHS.
  %0 = arith.extsi %lhs : vector<4x16xi8> to vector<4x16xi32>
  %1 = arith.extsi %rhs : vector<4x16xi8> to vector<4x16xi32>
  %2 = vector.contract {indexing_maps = #packed_maps,
                        iterator_types = ["parallel", "parallel", "reduction"],
                        kind = #vector.kind<add>} %0, %1, %acc
    : vector<4x16xi32>, vector<4x16xi32> into vector<4x4xi32>
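
  // For reference, the first element of the result with the data above is
  //   OUT[0][0] = ACC[0][0] + sum_k LHS[0][k] * RHS[0][k] = -1 + 83 = 82,
  // which matches the expected output checked in @main.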

  // Display the result of the multiplication
  vector.print str "Result(SMMLA):\n"
  %u0 = vector.extract %2[0] : vector<4xi32> from vector<4x4xi32>
  %u1 = vector.extract %2[1] : vector<4xi32> from vector<4x4xi32>
  %u2 = vector.extract %2[2] : vector<4xi32> from vector<4x4xi32>
  %u3 = vector.extract %2[3] : vector<4xi32> from vector<4x4xi32>
  vector.print %u0 : vector<4xi32>
  vector.print %u1 : vector<4xi32>
  vector.print %u2 : vector<4xi32>
  vector.print %u3 : vector<4xi32>

  return
}

// Test the operation where both LHS and RHS are interpreted as unsigned, hence
// we ultimately emit and execute the `ummla` instruction.

// CHECK-IR-LABEL: llvm.func @test_ummla
// CHECK-IR-COUNT-4: arm_neon.intr.ummla
func.func @test_ummla() {

  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8

  // Accumulator test data
  %acc_cst = arith.constant dense<[[39, 39, 46, 30],
                                   [22, 48, 61, 54],
                                   [41, 63, 27, 10],
                                   [37, 30, 16, 45]]> : vector<4x4xi32>

  %acc_mem = memref.alloca() : memref<4x4xi32>
  vector.transfer_write %acc_cst, %acc_mem[%c0, %c0] : vector<4x4xi32>, memref<4x4xi32>
  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x4xi32>, vector<4x4xi32>

  // LHS test data
  %lhs_cst = arith.constant dense<[[ 6,  6, 38, 30, 60,  4, 42, 11, 16, 12, 30, 41, 14, 55, 47, 25],
                                   [ 2, 19, 25, 29, 15, 23, 14, 19,  9, 16, 42, 17, 58, 62, 30,  3],
                                   [62, 50, 47, 18,  3, 48, 23,  8, 43, 29, 43, 15,  6, 38, 46, 25],
                                   [32, 27, 52, 39, 47, 26, 26, 13, 23, 29, 24, 44, 23, 45, 35, 51]]> : vector<4x16xi8>

  %lhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %lhs_cst, %lhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // RHS test data
  %rhs_cst = arith.constant dense<[[33,  0, 49, 34, 37,  8, 25, 19, 15, 26, 23, 18, 19, 16, 39, 33],
                                   [22, 17, 53, 58,  6, 35, 54, 23,  8, 53, 21, 27, 49, 25, 34, 12],
                                   [27, 18, 53, 53, 49, 11, 12, 39, 62, 47, 59, 29, 20, 18, 52, 25],
                                   [27, 40, 11, 52, 37, 60, 29, 44, 46, 25, 13, 33, 14, 53, 56, 39]]> : vector<4x16xi8>

  %rhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %rhs_cst, %rhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %rhs = vector.transfer_read %rhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // Matrix multiplication and accumulate with transposed RHS.
  %0 = arith.extui %lhs : vector<4x16xi8> to vector<4x16xi32>
  %1 = arith.extui %rhs : vector<4x16xi8> to vector<4x16xi32>
  %2 = vector.contract {indexing_maps = #packed_maps,
                        iterator_types = ["parallel", "parallel", "reduction"],
                        kind = #vector.kind<add>} %0, %1, %acc
    : vector<4x16xi32>, vector<4x16xi32> into vector<4x4xi32>

  // Display the result of the multiplication
  vector.print str "Result(UMMLA):\n"
  %u0 = vector.extract %2[0] : vector<4xi32> from vector<4x4xi32>
  %u1 = vector.extract %2[1] : vector<4xi32> from vector<4x4xi32>
  %u2 = vector.extract %2[2] : vector<4xi32> from vector<4x4xi32>
  %u3 = vector.extract %2[3] : vector<4xi32> from vector<4x4xi32>
  vector.print %u0 : vector<4xi32>
  vector.print %u1 : vector<4xi32>
  vector.print %u2 : vector<4xi32>
  vector.print %u3 : vector<4xi32>

  return
}

// Test the operation where LHS is interpreted as unsigned and RHS is
// interpreted as signed, hence we ultimately emit and execute the `usmmla`
// instruction.

// CHECK-IR-LABEL: llvm.func @test_usmmla
// CHECK-IR-COUNT-4: arm_neon.intr.usmmla
func.func @test_usmmla() {

  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8

  // Accumulator test data
  %acc_cst = arith.constant dense<[[-50,  22, -15,   6],
                                   [  0, -46,  32, -59],
                                   [-62, -60, -38,  17],
                                   [-50,   8, -12,  22]]> : vector<4x4xi32>

  %acc_mem = memref.alloca() : memref<4x4xi32>
  vector.transfer_write %acc_cst, %acc_mem[%c0, %c0] : vector<4x4xi32>, memref<4x4xi32>
  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x4xi32>, vector<4x4xi32>

  // LHS test data
  %lhs_cst = arith.constant dense<[[ 6,  6, 38, 30, 60,  4, 42, 11, 16, 12, 30, 41, 14, 55, 47, 25],
                                   [ 2, 19, 25, 29, 15, 23, 14, 19,  9, 16, 42, 17, 58, 62, 30,  3],
                                   [62, 50, 47, 18,  3, 48, 23,  8, 43, 29, 43, 15,  6, 38, 46, 25],
                                   [32, 27, 52, 39, 47, 26, 26, 13, 23, 29, 24, 44, 23, 45, 35, 51]]> : vector<4x16xi8>

  %lhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %lhs_cst, %lhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // RHS test data
  %rhs_cst = arith.constant dense<[[ -9, -10,   7,  -8,  -5,  -2,   9,   5,   8,   9,   6,  -3,  -9,   7,  -4,  -7],
                                   [ -2,   7,  -8,   2,   8,   7,   1,   2,  -3,   5,   8,  -2,   1,  -5,   2,   4],
                                   [  3,  -9,   4,  -3,  -3,  -3,   4,   6,  -1,   0,  -5,   6,   3,  -1,   9,  -3],
                                   [  3,  -2,   4,   1,   9,  -1,   1,  -5,   4,   9, -10,  -1,  -7,  -2,   0,  -3]]> : vector<4x16xi8>

  %rhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %rhs_cst, %rhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %rhs = vector.transfer_read %rhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // Matrix multiplication and accumulate with transposed RHS.
  %0 = arith.extui %lhs : vector<4x16xi8> to vector<4x16xi32>
  %1 = arith.extsi %rhs : vector<4x16xi8> to vector<4x16xi32>
  %2 = vector.contract {indexing_maps = #packed_maps,
                        iterator_types = ["parallel", "parallel", "reduction"],
                        kind = #vector.kind<add>} %0, %1, %acc
    : vector<4x16xi32>, vector<4x16xi32> into vector<4x4xi32>

  // Display the result of the multiplication
  vector.print str "Result(USMMLA):\n"
  %u0 = vector.extract %2[0] : vector<4xi32> from vector<4x4xi32>
  %u1 = vector.extract %2[1] : vector<4xi32> from vector<4x4xi32>
  %u2 = vector.extract %2[2] : vector<4xi32> from vector<4x4xi32>
  %u3 = vector.extract %2[3] : vector<4xi32> from vector<4x4xi32>
  vector.print %u0 : vector<4xi32>
  vector.print %u1 : vector<4xi32>
  vector.print %u2 : vector<4xi32>
  vector.print %u3 : vector<4xi32>

  return
}

// Test the operation where LHS is interpreted as signed and RHS is interpreted
// as unsigned. In this test we ultimately emit and execute the `usmmla`
// instruction with reversed operands; see `LowerContractionToNeonI8MMPattern.cpp`
// for more details.
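// (In outline: since LHS_s * RHS_u^T == (RHS_u * LHS_s^T)^T, the product can
// be computed with `usmmla` by swapping the operands and transposing the 2x2
// accumulator/result tiles.)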

// CHECK-IR-LABEL: llvm.func @test_summla
// CHECK-IR-COUNT-4: arm_neon.intr.usmmla
func.func @test_summla() {

  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8

  // Accumulator test data
  %acc_cst = arith.constant dense<[[-61,  52,   8, -54],
                                   [-25, -50,  22, -15],
                                   [  6,   0, -46,  32],
                                   [-59, -62, -60, -38]]> : vector<4x4xi32>

  %acc_mem = memref.alloca() : memref<4x4xi32>
  vector.transfer_write %acc_cst, %acc_mem[%c0, %c0] : vector<4x4xi32>, memref<4x4xi32>
  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x4xi32>, vector<4x4xi32>

  // LHS test data
  %lhs_cst = arith.constant dense<[[ -4,  -4,  -4,  -6,   0,   1,   6,   2,  -1,   4,   5,  -8,   9,   5,   4,   9],
                                   [ -1,   6,   0,   7,  -7,   8,   5,   8,  -7,   6,  -2,   1,   1,   5,  -4,  -4],
                                   [  4, -10,  -3,   5,   3,   2,   3,  -7,   9,  -9, -10,   7,  -8,  -5,  -2,   9],
                                   [  5,   8,   9,   6,  -3,  -9,   7,  -4,  -7,  -2,   7,  -8,   2,   8,   7,   1]]> : vector<4x16xi8>

  %lhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %lhs_cst, %lhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // RHS test data
  %rhs_cst = arith.constant dense<[[12, 39, 62, 47, 59, 29, 20, 18, 52, 25, 27, 40, 11, 52, 37, 60],
                                   [29, 44, 46, 25, 13, 33, 14, 53, 56, 39, 39, 39, 46, 30, 22, 48],
                                   [61, 54, 41, 63, 27, 10, 37, 30, 16, 45, 41, 51, 39, 28, 13, 28],
                                   [21, 28, 24, 40, 46, 30, 11, 19,  9, 11,  5, 46, 19, 26,  0,  9]]> : vector<4x16xi8>

  %rhs_mem = memref.alloca() : memref<4x16xi8>
  vector.transfer_write %rhs_cst, %rhs_mem[%c0, %c0] : vector<4x16xi8>, memref<4x16xi8>
  %rhs = vector.transfer_read %rhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x16xi8>, vector<4x16xi8>

  // Matrix multiplication and accumulate with transposed RHS.
  %0 = arith.extsi %lhs : vector<4x16xi8> to vector<4x16xi32>
  %1 = arith.extui %rhs : vector<4x16xi8> to vector<4x16xi32>
  %2 = vector.contract {indexing_maps = #packed_maps,
                        iterator_types = ["parallel", "parallel", "reduction"],
                        kind = #vector.kind<add>} %0, %1, %acc
    : vector<4x16xi32>, vector<4x16xi32> into vector<4x4xi32>

  // Display the result of the multiplication
  vector.print str "Result(SUMMLA (i.e. USMMLA transposed)):\n"
  %u0 = vector.extract %2[0] : vector<4xi32> from vector<4x4xi32>
  %u1 = vector.extract %2[1] : vector<4xi32> from vector<4x4xi32>
  %u2 = vector.extract %2[2] : vector<4xi32> from vector<4x4xi32>
  %u3 = vector.extract %2[3] : vector<4xi32> from vector<4x4xi32>
  vector.print %u0 : vector<4xi32>
  vector.print %u1 : vector<4xi32>
  vector.print %u2 : vector<4xi32>
  vector.print %u3 : vector<4xi32>

  return
}

func.func @main() {
// CHECK-LABEL: Result(SMMLA):
// CHECK: ( 82, -63, 95, 11 )
// CHECK: ( 184, -81, -17, -172 )
// CHECK: ( 168, -158, -251, -133 )
// CHECK: ( -139, 40, -48, 75 )
  func.call @test_smmla() : () -> ()

// CHECK-LABEL: Result(UMMLA):
// CHECK: ( 12414, 13508, 16691, 16069 )
// CHECK: ( 8935, 13219, 13408, 13644 )
// CHECK: ( 12223, 15233, 18131, 18553 )
// CHECK: ( 14459, 16573, 19443, 19417 )
  func.call @test_ummla() : () -> ()

// CHECK-LABEL: Result(USMMLA):
// CHECK: ( 176, 483, 468, 265 )
// CHECK: ( 23, 449, 192, -727 )
// CHECK: ( -128, 563, -30, 66 )
// CHECK: ( -476, 657, 202, 334 )
  func.call @test_usmmla() : () -> ()

// CHECK-LABEL: Result(SUMMLA (i.e. USMMLA transposed)):
// CHECK: ( 300, 716, 54, -378 )
// CHECK: ( 244, 746, 1184, 689 )
// CHECK: ( 253, -655, -688, 115 )
// CHECK: ( 995, 574, 1490, 177 )
  func.call @test_summla() : () -> ()

  return
}